From e9f7f9a4bc38fb490a43b46d3d70234a9ee44039 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 1 Mar 2021 16:32:31 +0100
Subject: [PATCH 001/146] Fix is_cython_func for additional imported code

* Fix `is_cython_func` for imported code loaded under `python_code`
module name
* Add `make_named_tempfile` context manager to test utils to test
loading of imported code
* Add test for validation of `initialize` params in custom module
---
 spacy/tests/test_misc.py | 35 +++++++++++++++++++++++++++++++++--
 spacy/tests/util.py      |  7 +++++++
 spacy/util.py            |  7 ++++---
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index e694baa40..99816d40f 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -7,7 +7,7 @@ from spacy import util
 from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList
+from spacy.util import dot_to_object, SimpleFrozenList, import_file
 from thinc.api import Config, Optimizer, ConfigValidationError
 from spacy.training.batchers import minibatch_by_words
 from spacy.lang.en import English
@@ -17,7 +17,7 @@ from spacy.schemas import ConfigSchemaTraining
 
 from thinc.api import get_current_ops, NumpyOps, CupyOps
 
-from .util import get_random_doc
+from .util import get_random_doc, make_named_tempfile
 
 
 @pytest.fixture
@@ -347,3 +347,34 @@ def test_resolve_dot_names():
     errors = e.value.errors
     assert len(errors) == 1
     assert errors[0]["loc"] == ["training", "xyz"]
+
+
+def test_import_code():
+    code_str = """
+from spacy import Language
+
+class DummyComponent:
+    def __init__(self, vocab, name):
+        pass
+
+    def initialize(self, get_examples, *, nlp, dummy_param: int):
+        pass
+
+@Language.factory(
+    "dummy_component",
+)
+def make_dummy_component(
+    nlp: Language, name: str
+):
+    return DummyComponent(nlp.vocab, name)
+"""
+
+    with make_named_tempfile(mode="w", suffix=".py") as fileh:
+        fileh.write(code_str)
+        fileh.flush()
+
+        import_file("python_code", fileh.name)
+        config = {"initialize": {"components": {"dummy_component": {"dummy_param": 1}}}}
+        nlp = English.from_config(config)
+        nlp.add_pipe("dummy_component")
+        nlp.initialize()
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index ef7b4d00d..475db4453 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -14,6 +14,13 @@ def make_tempfile(mode="r"):
     f.close()
 
 
+@contextlib.contextmanager
+def make_named_tempfile(mode="r", suffix=None):
+    f = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix)
+    yield f
+    f.close()
+
+
 def get_batch(batch_size):
     vocab = Vocab()
     docs = []
diff --git a/spacy/util.py b/spacy/util.py
index ce1022d25..bcb51fe7d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1454,9 +1454,10 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__"):  # method
-        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-        return hasattr(cls_func, attr)
+    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
+        and func.__module__ in sys.modules:  # method
+            cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+            return hasattr(cls_func, attr)
     return False
 
 

From 0efb7413f9642608fcd295f1aad740154dc3744a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 1 Mar 2021 17:54:14 +0100
Subject: [PATCH 002/146] Use make_tempdir instead of make_named_tempfile

---
 spacy/tests/test_misc.py | 11 ++++++-----
 spacy/tests/util.py      |  7 -------
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 99816d40f..58bebc4ca 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -17,7 +17,7 @@ from spacy.schemas import ConfigSchemaTraining
 
 from thinc.api import get_current_ops, NumpyOps, CupyOps
 
-from .util import get_random_doc, make_named_tempfile
+from .util import get_random_doc, make_tempdir
 
 
 @pytest.fixture
@@ -369,11 +369,12 @@ def make_dummy_component(
     return DummyComponent(nlp.vocab, name)
 """
 
-    with make_named_tempfile(mode="w", suffix=".py") as fileh:
-        fileh.write(code_str)
-        fileh.flush()
+    with make_tempdir() as temp_dir:
+        code_path = os.path.join(temp_dir, "code.py")
+        with open(code_path, "w") as fileh:
+            fileh.write(code_str)
 
-        import_file("python_code", fileh.name)
+        import_file("python_code", code_path)
         config = {"initialize": {"components": {"dummy_component": {"dummy_param": 1}}}}
         nlp = English.from_config(config)
         nlp.add_pipe("dummy_component")
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index 475db4453..ef7b4d00d 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -14,13 +14,6 @@ def make_tempfile(mode="r"):
     f.close()
 
 
-@contextlib.contextmanager
-def make_named_tempfile(mode="r", suffix=None):
-    f = tempfile.NamedTemporaryFile(mode=mode, suffix=suffix)
-    yield f
-    f.close()
-
-
 def get_batch(batch_size):
     vocab = Vocab()
     docs = []

From b1945f4e733a948a915ded1f6544ef84e12c9495 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Mar 2021 12:06:59 +0100
Subject: [PATCH 003/146] sync pins with thinc

---
 requirements.txt | 6 +++---
 setup.cfg        | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 85fc6a62c..afd4b8845 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,7 +10,7 @@ wasabi>=0.8.1,<1.1.0
 srsly>=2.4.0,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
-pathy
+pathy>=0.3.5
 # Third party dependencies
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
@@ -21,11 +21,11 @@ jinja2
 setuptools
 packaging>=20.0
 importlib_metadata>=0.20; python_version < "3.8"
-typing_extensions>=3.7.4; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-hypothesis
+hypothesis>=3.27.0,<6.0.0
diff --git a/setup.cfg b/setup.cfg
index 6f8572381..482c1fbdd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
     srsly>=2.4.0,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
-    pathy
+    pathy>=0.3.5
     # Third-party dependencies
     tqdm>=4.38.0,<5.0.0
     numpy>=1.15.0
@@ -58,7 +58,7 @@ install_requires =
     setuptools
     packaging>=20.0
     importlib_metadata>=0.20; python_version < "3.8"
-    typing_extensions>=3.7.4; python_version < "3.8"
+    typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
 
 [options.entry_points]
 console_scripts =

From d879d30aea10748ad13865cd80530d4aeb9b2889 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Mar 2021 13:20:17 +0100
Subject: [PATCH 004/146] raise hypothesis pin

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index afd4b8845..01a3be120 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,4 +28,4 @@ pytest>=5.2.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-hypothesis>=3.27.0,<6.0.0
+hypothesis>=3.27.0,<7.0.0

From 0fddc0447cbc0fee237a76e5b4128d893156490b Mon Sep 17 00:00:00 2001
From: graue70 <23035329+graue70@users.noreply.github.com>
Date: Tue, 2 Mar 2021 14:00:14 +0100
Subject: [PATCH 005/146] Fix copy & paste error in API docs

---
 website/docs/api/language.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 6a8744463..a90476dab 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -364,7 +364,7 @@ Evaluate a pipeline's components.
 
 <Infobox variant="warning" title="Changed in v3.0">
 
-The `Language.update` method now takes a batch of [`Example`](/api/example)
+The `Language.evaluate` method now takes a batch of [`Example`](/api/example)
 objects instead of tuples of `Doc` and `GoldParse` objects.
 
 </Infobox>

From fb98862337144590caf325abf48e80f390f3244b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 2 Mar 2021 15:11:18 +0100
Subject: [PATCH 006/146] Add hint for --gpu-id to CLI device info (#7234)

* Add hint for --gpu-id to CLI device info

If the user has `cupy` and an available GPU, add a hint about using
`--gpu-id 0` to the CLI output.

* Undo change to original CPU message
---
 spacy/cli/_util.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 86b3ab356..228cc622a 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -11,6 +11,7 @@ from click.parser import split_arg_string
 from typer.main import get_command
 from contextlib import contextmanager
 from thinc.api import Config, ConfigValidationError, require_gpu
+from thinc.util import has_cupy, gpu_is_available
 from configparser import InterpolationError
 import os
 
@@ -510,3 +511,5 @@ def setup_gpu(use_gpu: int) -> None:
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+        if has_cupy and gpu_is_available():
+            msg.info("To switch to GPU 0, use the option: --gpu-id 0")

From 212f0e779eeb1e1f66619bcb50739e4dbf90f4d5 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 2 Mar 2021 15:12:54 +0100
Subject: [PATCH 007/146] Support doc.spans in Example.from_dict (#7197)

* add support for spans in Example.from_dict

* add unit tests

* update error to E879
---
 spacy/errors.py                          |  6 +-
 spacy/tests/training/test_new_example.py | 98 ++++++++++++++++++++++++
 spacy/training/example.pyx               | 31 +++++++-
 3 files changed, 130 insertions(+), 5 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index fc98fdaa6..2ebc49e8c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -321,7 +321,8 @@ class Errors:
             "https://spacy.io/api/top-level#util.filter_spans")
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
-            "you're setting don't overlap.")
+            "you're setting don't overlap. To work with overlapping entities, "
+            "consider using doc.spans instead.")
     E106 = ("Can't find `doc._.{attr}` attribute specified in the underscore "
             "settings: {opts}")
     E107 = ("Value of `doc._.{attr}` is not JSON-serializable: {value}")
@@ -487,6 +488,9 @@ class Errors:
 
     # New errors added in v3.x
 
+    E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
+            "a list of spans, with each span represented by a tuple (start_char, end_char). "
+            "The tuple can be optionally extended with a label and a KB ID.")
     E880 = ("The 'wandb' library could not be found - did you install it? "
             "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
             "config section, instead of the 'WandbLogger'.")
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index be3419b82..b8fbaf606 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -196,6 +196,104 @@ def test_Example_from_dict_with_entities_invalid(annots):
     assert len(list(example.reference.ents)) == 0
 
 
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "entities": [
+                (7, 15, "LOC"),
+                (11, 15, "LOC"),
+                (20, 26, "LOC"),
+            ],  # overlapping
+        }
+    ],
+)
+def test_Example_from_dict_with_entities_overlapping(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    with pytest.raises(ValueError):
+        Example.from_dict(predicted, annots)
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": {
+                "cities": [(7, 15, "LOC"), (20, 26, "LOC")],
+                "people": [(0, 1, "PERSON")],
+            },
+        }
+    ],
+)
+def test_Example_from_dict_with_spans(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    assert len(list(example.reference.ents)) == 0
+    assert len(list(example.reference.spans["cities"])) == 2
+    assert len(list(example.reference.spans["people"])) == 1
+    for span in example.reference.spans["cities"]:
+        assert span.label_ == "LOC"
+    for span in example.reference.spans["people"]:
+        assert span.label_ == "PERSON"
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": {
+                "cities": [(7, 15, "LOC"), (11, 15, "LOC"), (20, 26, "LOC")],
+                "people": [(0, 1, "PERSON")],
+            },
+        }
+    ],
+)
+def test_Example_from_dict_with_spans_overlapping(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    assert len(list(example.reference.ents)) == 0
+    assert len(list(example.reference.spans["cities"])) == 3
+    assert len(list(example.reference.spans["people"])) == 1
+    for span in example.reference.spans["cities"]:
+        assert span.label_ == "LOC"
+    for span in example.reference.spans["people"]:
+        assert span.label_ == "PERSON"
+
+
+@pytest.mark.parametrize(
+    "annots",
+    [
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": [(0, 1, "PERSON")],
+        },
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": {"cities": (7, 15, "LOC")},
+        },
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": {"cities": [7, 11]},
+        },
+        {
+            "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+            "spans": {"cities": [[7]]},
+        },
+    ],
+)
+def test_Example_from_dict_with_spans_invalid(annots):
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    with pytest.raises(ValueError):
+        Example.from_dict(predicted, annots)
+
+
 @pytest.mark.parametrize(
     "annots",
     [
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index dc1c74e8a..9cf825bf9 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -22,6 +22,8 @@ cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
        _add_entities_to_doc(output, doc_annot["entities"])
+    if "spans" in doc_annot:
+       _add_spans_to_doc(output, doc_annot["spans"])
     if array.size:
         output = output.from_array(attrs, array)
     # links are currently added with ENT_KB_ID on the token level
@@ -314,13 +316,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
 
     for key, value in doc_annot.items():
         if value:
-            if key == "entities":
+            if key in ["entities", "cats", "spans"]:
                 pass
             elif key == "links":
                 ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
                 tok_annot["ENT_KB_ID"] = ent_kb_ids
-            elif key == "cats":
-                pass
             else:
                 raise ValueError(Errors.E974.format(obj="doc", key=key))
 
@@ -351,6 +351,29 @@ def _annot2array(vocab, tok_annot, doc_annot):
     return attrs, array.T
 
 
+def _add_spans_to_doc(doc, spans_data):
+    if not isinstance(spans_data, dict):
+        raise ValueError(Errors.E879)
+    for key, span_list in spans_data.items():
+        spans = []
+        if not isinstance(span_list, list):
+            raise ValueError(Errors.E879)
+        for span_tuple in span_list:
+            if not isinstance(span_tuple, (list, tuple)) or len(span_tuple) < 2:
+                raise ValueError(Errors.E879)
+            start_char = span_tuple[0]
+            end_char = span_tuple[1]
+            label = 0
+            kb_id = 0
+            if len(span_tuple) > 2:
+                label = span_tuple[2]
+            if len(span_tuple) > 3:
+                kb_id = span_tuple[3]
+            span = doc.char_span(start_char, end_char, label=label, kb_id=kb_id)
+            spans.append(span)
+        doc.spans[key] = spans
+
+
 def _add_entities_to_doc(doc, ner_data):
     if ner_data is None:
         return
@@ -397,7 +420,7 @@ def _fix_legacy_dict_data(example_dict):
                 pass
             elif key == "ids":
                 pass
-            elif key in ("cats", "links"):
+            elif key in ("cats", "links", "spans"):
                 doc_dict[key] = value
             elif key in ("ner", "entities"):
                 doc_dict["entities"] = value

From 8a4200d4e9f1482f214169ce62b2a8b65789d4d7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 2 Mar 2021 15:53:30 +0100
Subject: [PATCH 008/146] Omit unused tok2vec/transformer components

Omit unused tok2vec/transformer components in quickstart template.
---
 spacy/cli/templates/quickstart_training.jinja | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 89c8ec2d4..38fc23272 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -16,7 +16,11 @@ gpu_allocator = null
 
 [nlp]
 lang = "{{ lang }}"
+{%- if "tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "entity_linker" in components or (("textcat" in components or "textcat_multilabel" in components) and optimize == "accuracy") -%}
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
+{%- else -%}
+{%- set full_pipeline = components %}
+{%- endif %}
 pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
 batch_size = {{ 128 if hardware == "gpu" else 1000 }}
 

From d900c5506111c8be4dc14234d7124653ba44996c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Mar 2021 17:56:28 +0100
Subject: [PATCH 009/146] consistently use registry as callable

---
 spacy/ml/models/entity_linker.py               |  8 ++++----
 spacy/ml/models/multi_task.py                  |  4 ++--
 spacy/ml/models/parser.py                      |  4 ++--
 spacy/ml/models/tagger.py                      |  2 +-
 spacy/ml/models/textcat.py                     |  8 ++++----
 spacy/ml/models/tok2vec.py                     | 16 ++++++++--------
 spacy/tests/pipeline/test_entity_linker.py     |  2 +-
 spacy/tests/serialize/test_serialize_config.py |  2 +-
 spacy/tests/serialize/test_serialize_kb.py     |  2 +-
 spacy/tests/test_architectures.py              | 10 +++++-----
 spacy/tests/training/test_readers.py           |  2 +-
 website/docs/usage/layers-architectures.md     | 12 ++++++------
 12 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index f37203b1b..21e1c53b9 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -8,7 +8,7 @@ from ...kb import KnowledgeBase, Candidate, get_candidates
 from ...vocab import Vocab
 
 
-@registry.architectures.register("spacy.EntityLinker.v1")
+@registry.architectures("spacy.EntityLinker.v1")
 def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
         token_width = tok2vec.get_dim("nO")
@@ -25,7 +25,7 @@ def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     return model
 
 
-@registry.misc.register("spacy.KBFromFile.v1")
+@registry.misc("spacy.KBFromFile.v1")
 def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     def kb_from_file(vocab):
         kb = KnowledgeBase(vocab, entity_vector_length=1)
@@ -35,7 +35,7 @@ def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
     return kb_from_file
 
 
-@registry.misc.register("spacy.EmptyKB.v1")
+@registry.misc("spacy.EmptyKB.v1")
 def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
     def empty_kb_factory(vocab):
         return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
@@ -43,6 +43,6 @@ def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
     return empty_kb_factory
 
 
-@registry.misc.register("spacy.CandidateGenerator.v1")
+@registry.misc("spacy.CandidateGenerator.v1")
 def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
     return get_candidates
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 7c0589bff..8aa0f3c2b 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
     from ...tokens import Doc  # noqa: F401
 
 
-@registry.architectures.register("spacy.PretrainVectors.v1")
+@registry.architectures("spacy.PretrainVectors.v1")
 def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
@@ -40,7 +40,7 @@ def create_pretrain_vectors(
     return create_vectors_objective
 
 
-@registry.architectures.register("spacy.PretrainCharacters.v1")
+@registry.architectures("spacy.PretrainCharacters.v1")
 def create_pretrain_characters(
     maxout_pieces: int, hidden_size: int, n_characters: int
 ) -> Callable[["Vocab", Model], Model]:
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index da53f562e..861094209 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -10,7 +10,7 @@ from ..tb_framework import TransitionModel
 from ...tokens import Doc
 
 
-@registry.architectures.register("spacy.TransitionBasedParser.v1")
+@registry.architectures("spacy.TransitionBasedParser.v1")
 def transition_parser_v1(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
@@ -31,7 +31,7 @@ def transition_parser_v1(
     )
 
 
-@registry.architectures.register("spacy.TransitionBasedParser.v2")
+@registry.architectures("spacy.TransitionBasedParser.v2")
 def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 09405214c..87944e305 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -6,7 +6,7 @@ from ...util import registry
 from ...tokens import Doc
 
 
-@registry.architectures.register("spacy.Tagger.v1")
+@registry.architectures("spacy.Tagger.v1")
 def build_tagger_model(
     tok2vec: Model[List[Doc], List[Floats2d]], nO: Optional[int] = None
 ) -> Model[List[Doc], List[Floats2d]]:
diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py
index 0234530e6..a1855c5a0 100644
--- a/spacy/ml/models/textcat.py
+++ b/spacy/ml/models/textcat.py
@@ -15,7 +15,7 @@ from ...tokens import Doc
 from .tok2vec import get_tok2vec_width
 
 
-@registry.architectures.register("spacy.TextCatCNN.v1")
+@registry.architectures("spacy.TextCatCNN.v1")
 def build_simple_cnn_text_classifier(
     tok2vec: Model, exclusive_classes: bool, nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
@@ -41,7 +41,7 @@ def build_simple_cnn_text_classifier(
     return model
 
 
-@registry.architectures.register("spacy.TextCatBOW.v1")
+@registry.architectures("spacy.TextCatBOW.v1")
 def build_bow_text_classifier(
     exclusive_classes: bool,
     ngram_size: int,
@@ -60,7 +60,7 @@ def build_bow_text_classifier(
     return model
 
 
-@registry.architectures.register("spacy.TextCatEnsemble.v2")
+@registry.architectures("spacy.TextCatEnsemble.v2")
 def build_text_classifier_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     linear_model: Model[List[Doc], Floats2d],
@@ -112,7 +112,7 @@ def init_ensemble_textcat(model, X, Y) -> Model:
     return model
 
 
-@registry.architectures.register("spacy.TextCatLowData.v1")
+@registry.architectures("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
     width: int, dropout: Optional[float], nO: Optional[int] = None
 ) -> Model[List[Doc], Floats2d]:
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index c4bd6b0d7..5790af631 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -14,7 +14,7 @@ from ...pipeline.tok2vec import Tok2VecListener
 from ...attrs import intify_attr
 
 
-@registry.architectures.register("spacy.Tok2VecListener.v1")
+@registry.architectures("spacy.Tok2VecListener.v1")
 def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec
@@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model):
     return nO
 
 
-@registry.architectures.register("spacy.HashEmbedCNN.v1")
+@registry.architectures("spacy.HashEmbedCNN.v1")
 def build_hash_embed_cnn_tok2vec(
     *,
     width: int,
@@ -87,7 +87,7 @@ def build_hash_embed_cnn_tok2vec(
     )
 
 
-@registry.architectures.register("spacy.Tok2Vec.v2")
+@registry.architectures("spacy.Tok2Vec.v2")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],
     encode: Model[List[Floats2d], List[Floats2d]],
@@ -108,7 +108,7 @@ def build_Tok2Vec_model(
     return tok2vec
 
 
-@registry.architectures.register("spacy.MultiHashEmbed.v1")
+@registry.architectures("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(
     width: int,
     attrs: List[Union[str, int]],
@@ -182,7 +182,7 @@ def MultiHashEmbed(
     return model
 
 
-@registry.architectures.register("spacy.CharacterEmbed.v1")
+@registry.architectures("spacy.CharacterEmbed.v1")
 def CharacterEmbed(
     width: int,
     rows: int,
@@ -255,7 +255,7 @@ def CharacterEmbed(
     return model
 
 
-@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
+@registry.architectures("spacy.MaxoutWindowEncoder.v2")
 def MaxoutWindowEncoder(
     width: int, window_size: int, maxout_pieces: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -287,7 +287,7 @@ def MaxoutWindowEncoder(
     return with_array(model, pad=receptive_field)
 
 
-@registry.architectures.register("spacy.MishWindowEncoder.v2")
+@registry.architectures("spacy.MishWindowEncoder.v2")
 def MishWindowEncoder(
     width: int, window_size: int, depth: int
 ) -> Model[List[Floats2d], List[Floats2d]]:
@@ -310,7 +310,7 @@ def MishWindowEncoder(
     return with_array(model)
 
 
-@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
+@registry.architectures("spacy.TorchBiLSTMEncoder.v1")
 def BiLSTMEncoder(
     width: int, depth: int, dropout: float
 ) -> Model[List[Floats2d], List[Floats2d]]:
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 348298e06..4883cceb8 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -230,7 +230,7 @@ def test_el_pipe_configuration(nlp):
     def get_lowercased_candidates(kb, span):
         return kb.get_alias_candidates(span.text.lower())
 
-    @registry.misc.register("spacy.LowercaseCandidateGenerator.v1")
+    @registry.misc("spacy.LowercaseCandidateGenerator.v1")
     def create_candidates() -> Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]:
         return get_lowercased_candidates
 
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 6709defb8..86f726c43 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -160,7 +160,7 @@ subword_features = false
 """
 
 
-@registry.architectures.register("my_test_parser")
+@registry.architectures("my_test_parser")
 def my_parser():
     tok2vec = build_Tok2Vec_model(
         MultiHashEmbed(
diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py
index 352c335ea..fb04d31a3 100644
--- a/spacy/tests/serialize/test_serialize_kb.py
+++ b/spacy/tests/serialize/test_serialize_kb.py
@@ -108,7 +108,7 @@ def test_serialize_subclassed_kb():
             super().__init__(vocab, entity_vector_length)
             self.custom_field = custom_field
 
-    @registry.misc.register("spacy.CustomKB.v1")
+    @registry.misc("spacy.CustomKB.v1")
     def custom_kb(
         entity_vector_length: int, custom_field: int
     ) -> Callable[["Vocab"], KnowledgeBase]:
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
index 31b2a2d2f..c9e451471 100644
--- a/spacy/tests/test_architectures.py
+++ b/spacy/tests/test_architectures.py
@@ -4,12 +4,12 @@ from thinc.api import Linear
 from catalogue import RegistryError
 
 
-@registry.architectures.register("my_test_function")
-def create_model(nr_in, nr_out):
-    return Linear(nr_in, nr_out)
-
-
 def test_get_architecture():
+
+    @registry.architectures("my_test_function")
+    def create_model(nr_in, nr_out):
+        return Linear(nr_in, nr_out)
+
     arch = registry.architectures.get("my_test_function")
     assert arch is create_model
     with pytest.raises(RegistryError):
diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py
index 1c698abcc..f53660818 100644
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@@ -27,7 +27,7 @@ def test_readers():
     factory = "textcat"
     """
 
-    @registry.readers.register("myreader.v1")
+    @registry.readers("myreader.v1")
     def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
         annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
 
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index d7a7d3ce8..0bc935d51 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -15,7 +15,7 @@ next: /usage/projects
 > ```python
 > from thinc.api import Model, chain
 >
-> @spacy.registry.architectures.register("model.v1")
+> @spacy.registry.architectures("model.v1")
 > def build_model(width: int, classes: int) -> Model:
 >     tok2vec = build_tok2vec(width)
 >     output_layer = build_output_layer(width, classes)
@@ -563,7 +563,7 @@ matrix** (~~Floats2d~~) of predictions:
 
 ```python
 ### The model architecture
-@spacy.registry.architectures.register("rel_model.v1")
+@spacy.registry.architectures("rel_model.v1")
 def create_relation_model(...) -> Model[List[Doc], Floats2d]:
     model = ...  # 👈 model will go here
     return model
@@ -589,7 +589,7 @@ transforms the instance tensor into a final tensor holding the predictions:
 
 ```python
 ### The model architecture {highlight="6"}
-@spacy.registry.architectures.register("rel_model.v1")
+@spacy.registry.architectures("rel_model.v1")
 def create_relation_model(
     create_instance_tensor: Model[List[Doc], Floats2d],
     classification_layer: Model[Floats2d, Floats2d],
@@ -613,7 +613,7 @@ The `classification_layer` could be something like a
 
 ```python
 ### The classification layer
-@spacy.registry.architectures.register("rel_classification_layer.v1")
+@spacy.registry.architectures("rel_classification_layer.v1")
 def create_classification_layer(
     nO: int = None, nI: int = None
 ) -> Model[Floats2d, Floats2d]:
@@ -650,7 +650,7 @@ that has the full implementation.
 
 ```python
 ### The layer that creates the instance tensor
-@spacy.registry.architectures.register("rel_instance_tensor.v1")
+@spacy.registry.architectures("rel_instance_tensor.v1")
 def create_tensors(
     tok2vec: Model[List[Doc], List[Floats2d]],
     pooling: Model[Ragged, Floats2d],
@@ -731,7 +731,7 @@ are within a **maximum distance** (in number of tokens) of each other:
 
 ```python
 ### Candidate generation
-@spacy.registry.misc.register("rel_instance_generator.v1")
+@spacy.registry.misc("rel_instance_generator.v1")
 def create_instances(max_length: int) -> Callable[[Doc], List[Tuple[Span, Span]]]:
     def get_candidates(doc: "Doc") -> List[Tuple[Span, Span]]:
         candidates = []

From 682a6232e3af6f01602b6a0c4802ac3e0ed4655a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Mar 2021 17:59:13 +0100
Subject: [PATCH 010/146] fix typo

---
 website/docs/api/architectures.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index d8f0ce022..1739836ed 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -19,7 +19,7 @@ spaCy's built-in architectures that are used for different NLP tasks. All
 trainable [built-in components](/api#architecture-pipeline) expect a `model`
 argument defined in the config and document their the default architecture.
 Custom architectures can be registered using the
-[`@spacy.registry.architectures`](/api/top-level#regsitry) decorator and used as
+[`@spacy.registry.architectures`](/api/top-level#registry) decorator and used as
 part of the [training config](/usage/training#custom-functions). Also see the
 usage documentation on
 [layers and model architectures](/usage/layers-architectures).

From 1b0d413e45e1495d58a6756f4f1dad82953a9bb6 Mon Sep 17 00:00:00 2001
From: vincent d warmerdam <vincentwarmerdam@gmail.com>
Date: Fri, 5 Mar 2021 14:31:15 +0100
Subject: [PATCH 011/146] Removed Languages that were listed twice on Docs
 (#7272)

* removed languages that were listed twice

* sorted

* d0h

* the d0h strikes back when you dont hit save
---
 website/meta/languages.json | 478 ++++++++++++++++++++++++++++--------
 1 file changed, 369 insertions(+), 109 deletions(-)

diff --git a/website/meta/languages.json b/website/meta/languages.json
index 579dca9fe..e05718047 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -1,85 +1,201 @@
 {
     "languages": [
-        { "code": "af", "name": "Afrikaans" },
-        { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
-        { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
-        { "code": "bn", "name": "Bengali", "has_examples": true },
-        { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
-        { "code": "cs", "name": "Czech", "has_examples": true },
+        {
+            "code": "af",
+            "name": "Afrikaans"
+        },
+        {
+            "code": "ar",
+            "name": "Arabic",
+            "example": "هذه جملة",
+            "has_examples": true
+        },
+        {
+            "code": "bg",
+            "name": "Bulgarian",
+            "example": "Това е изречение",
+            "has_examples": true
+        },
+        {
+            "code": "bn",
+            "name": "Bengali",
+            "has_examples": true
+        },
+        {
+            "code": "ca",
+            "name": "Catalan",
+            "example": "Això és una frase.",
+            "has_examples": true
+        },
+        {
+            "code": "cs",
+            "name": "Czech",
+            "has_examples": true
+        },
         {
             "code": "da",
             "name": "Danish",
             "example": "Dette er en sætning.",
             "has_examples": true,
-            "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
+            "models": [
+                "da_core_news_sm",
+                "da_core_news_md",
+                "da_core_news_lg"
+            ]
         },
         {
             "code": "de",
             "name": "German",
-            "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
+            "models": [
+                "de_core_news_sm",
+                "de_core_news_md",
+                "de_core_news_lg",
+                "de_dep_news_trf"
+            ],
             "example": "Dies ist ein Satz.",
             "has_examples": true
         },
         {
             "code": "el",
             "name": "Greek",
-            "models": ["el_core_news_sm", "el_core_news_md", "el_core_news_lg"],
+            "models": [
+                "el_core_news_sm",
+                "el_core_news_md",
+                "el_core_news_lg"
+            ],
             "example": "Αυτή είναι μια πρόταση.",
             "has_examples": true
         },
         {
             "code": "en",
             "name": "English",
-            "models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
+            "models": [
+                "en_core_web_sm",
+                "en_core_web_md",
+                "en_core_web_lg",
+                "en_core_web_trf"
+            ],
             "example": "This is a sentence.",
             "has_examples": true
         },
         {
             "code": "es",
             "name": "Spanish",
-            "models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
+            "models": [
+                "es_core_news_sm",
+                "es_core_news_md",
+                "es_core_news_lg",
+                "es_dep_news_trf"
+            ],
             "example": "Esto es una frase.",
             "has_examples": true
         },
-        { "code": "et", "name": "Estonian" },
-        { "code": "eu", "name": "Basque", "has_examples": true },
-        { "code": "fa", "name": "Persian", "has_examples": true },
-        { "code": "fi", "name": "Finnish", "has_examples": true },
+        {
+            "code": "et",
+            "name": "Estonian"
+        },
+        {
+            "code": "eu",
+            "name": "Basque",
+            "has_examples": true
+        },
+        {
+            "code": "fa",
+            "name": "Persian",
+            "has_examples": true
+        },
+        {
+            "code": "fi",
+            "name": "Finnish",
+            "has_examples": true
+        },
         {
             "code": "fr",
             "name": "French",
-            "models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
+            "models": [
+                "fr_core_news_sm",
+                "fr_core_news_md",
+                "fr_core_news_lg",
+                "fr_dep_news_trf"
+            ],
             "example": "C'est une phrase.",
             "has_examples": true
         },
-        { "code": "ga", "name": "Irish" },
-        { "code": "gu", "name": "Gujarati", "has_examples": true },
-        { "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
-        { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
-        { "code": "hr", "name": "Croatian", "has_examples": true },
-        { "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
-        { "code": "hy", "name": "Armenian", "has_examples": true },
+        {
+            "code": "ga",
+            "name": "Irish"
+        },
+        {
+            "code": "gu",
+            "name": "Gujarati",
+            "has_examples": true
+        },
+        {
+            "code": "he",
+            "name": "Hebrew",
+            "example": "זהו משפט.",
+            "has_examples": true
+        },
+        {
+            "code": "hi",
+            "name": "Hindi",
+            "example": "यह एक वाक्य है।",
+            "has_examples": true
+        },
+        {
+            "code": "hr",
+            "name": "Croatian",
+            "has_examples": true
+        },
+        {
+            "code": "hu",
+            "name": "Hungarian",
+            "example": "Ez egy mondat.",
+            "has_examples": true
+        },
+        {
+            "code": "hy",
+            "name": "Armenian",
+            "has_examples": true
+        },
         {
             "code": "id",
             "name": "Indonesian",
             "example": "Ini adalah sebuah kalimat.",
             "has_examples": true
         },
-        { "code": "is", "name": "Icelandic" },
+        {
+            "code": "is",
+            "name": "Icelandic"
+        },
         {
             "code": "it",
             "name": "Italian",
-            "models": ["it_core_news_sm", "it_core_news_md", "it_core_news_lg"],
+            "models": [
+                "it_core_news_sm",
+                "it_core_news_md",
+                "it_core_news_lg"
+            ],
             "example": "Questa è una frase.",
             "has_examples": true
         },
         {
             "code": "ja",
             "name": "Japanese",
-            "models": ["ja_core_news_sm", "ja_core_news_md", "ja_core_news_lg"],
+            "models": [
+                "ja_core_news_sm",
+                "ja_core_news_md",
+                "ja_core_news_lg"
+            ],
             "dependencies": [
-                { "name": "Unidic", "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj" },
-                { "name": "Mecab", "url": "https://github.com/taku910/mecab" },
+                {
+                    "name": "Unidic",
+                    "url": "http://unidic.ninjal.ac.jp/back_number#unidic_cwj"
+                },
+                {
+                    "name": "Mecab",
+                    "url": "https://github.com/taku910/mecab"
+                },
                 {
                     "name": "SudachiPy",
                     "url": "https://github.com/WorksApplications/SudachiPy"
@@ -88,7 +204,11 @@
             "example": "これは文章です。",
             "has_examples": true
         },
-        { "code": "kn", "name": "Kannada", "has_examples": true },
+        {
+            "code": "kn",
+            "name": "Kannada",
+            "has_examples": true
+        },
         {
             "code": "ko",
             "name": "Korean",
@@ -97,8 +217,14 @@
                     "name": "mecab-ko",
                     "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
                 },
-                { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
-                { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
+                {
+                    "name": "mecab-ko-dic",
+                    "url": "https://bitbucket.org/eunjeon/mecab-ko-dic"
+                },
+                {
+                    "name": "natto-py",
+                    "url": "https://github.com/buruzaemon/natto-py"
+                }
             ],
             "example": "이것은 문장입니다.",
             "has_examples": true
@@ -109,7 +235,11 @@
             "example": "Адамга эң кыйыны — күн сайын адам болуу",
             "has_examples": true
         },
-        { "code": "lb", "name": "Luxembourgish", "has_examples": true },
+        {
+            "code": "lb",
+            "name": "Luxembourgish",
+            "has_examples": true
+        },
         {
             "code": "lij",
             "name": "Ligurian",
@@ -120,29 +250,53 @@
             "code": "lt",
             "name": "Lithuanian",
             "has_examples": true,
-            "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
+            "models": [
+                "lt_core_news_sm",
+                "lt_core_news_md",
+                "lt_core_news_lg"
+            ]
+        },
+        {
+            "code": "lv",
+            "name": "Latvian"
         },
-        { "code": "lv", "name": "Latvian" },
         {
             "code": "mk",
-            "name": "Macedonian",
-            "has_examples": false,
-            "models": ["mk_core_news_sm", "mk_core_news_md", "mk_core_news_lg"]
+            "name": "Macedonian"
+        },
+        {
+            "code": "ml",
+            "name": "Malayalam",
+            "has_examples": true
+        },
+        {
+            "code": "mr",
+            "name": "Marathi"
         },
-        { "code": "ml", "name": "Malayalam", "has_examples": true },
-        { "code": "mr", "name": "Marathi" },
         {
             "code": "nb",
             "name": "Norwegian Bokmål",
             "example": "Dette er en setning.",
             "has_examples": true,
-            "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
+            "models": [
+                "nb_core_news_sm",
+                "nb_core_news_md",
+                "nb_core_news_lg"
+            ]
+        },
+        {
+            "code": "ne",
+            "name": "Nepali",
+            "has_examples": true
         },
-        { "code": "ne", "name": "Nepali", "has_examples": true },
         {
             "code": "nl",
             "name": "Dutch",
-            "models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
+            "models": [
+                "nl_core_news_sm",
+                "nl_core_news_md",
+                "nl_core_news_lg"
+            ],
             "example": "Dit is een zin.",
             "has_examples": true
         },
@@ -151,12 +305,20 @@
             "name": "Polish",
             "example": "To jest zdanie.",
             "has_examples": true,
-            "models": ["pl_core_news_sm", "pl_core_news_md", "pl_core_news_lg"]
+            "models": [
+                "pl_core_news_sm",
+                "pl_core_news_md",
+                "pl_core_news_lg"
+            ]
         },
         {
             "code": "pt",
             "name": "Portuguese",
-            "models": ["pt_core_news_sm", "pt_core_news_md", "pt_core_news_lg"],
+            "models": [
+                "pt_core_news_sm",
+                "pt_core_news_md",
+                "pt_core_news_lg"
+            ],
             "example": "Esta é uma frase.",
             "has_examples": true
         },
@@ -165,95 +327,157 @@
             "name": "Romanian",
             "example": "Aceasta este o propoziție.",
             "has_examples": true,
-            "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
+            "models": [
+                "ro_core_news_sm",
+                "ro_core_news_md",
+                "ro_core_news_lg"
+            ]
         },
         {
             "code": "ru",
             "name": "Russian",
             "has_examples": true,
-            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }],
-            "models": ["ru_core_news_sm", "ru_core_news_md", "ru_core_news_lg"]
+            "dependencies": [
+                {
+                    "name": "pymorphy2",
+                    "url": "https://github.com/kmike/pymorphy2"
+                }
+            ],
+            "models": [
+                "ru_core_news_sm",
+                "ru_core_news_md",
+                "ru_core_news_lg"
+            ]
+        },
+        {
+            "code": "sa",
+            "name": "Sanskrit",
+            "has_examples": true
+        },
+        {
+            "code": "si",
+            "name": "Sinhala",
+            "example": "මෙය වාක්‍යයකි.",
+            "has_examples": true
+        },
+        {
+            "code": "sk",
+            "name": "Slovak",
+            "has_examples": true
+        },
+        {
+            "code": "sl",
+            "name": "Slovenian"
         },
-        { "code": "sa", "name": "Sanskrit", "has_examples": true },
-        { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
-        { "code": "sk", "name": "Slovak", "has_examples": true },
-        { "code": "sl", "name": "Slovenian" },
         {
             "code": "sq",
             "name": "Albanian",
             "example": "Kjo është një fjali.",
             "has_examples": true
         },
-        { "code": "sr", "name": "Serbian", "has_examples": true },
-        { "code": "sv", "name": "Swedish", "has_examples": true },
-        { "code": "ta", "name": "Tamil", "has_examples": true },
-        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
+        {
+            "code": "sr",
+            "name": "Serbian",
+            "has_examples": true
+        },
+        {
+            "code": "sv",
+            "name": "Swedish",
+            "has_examples": true
+        },
+        {
+            "code": "ta",
+            "name": "Tamil",
+            "has_examples": true
+        },
+        {
+            "code": "te",
+            "name": "Telugu",
+            "example": "ఇది ఒక వాక్యం.",
+            "has_examples": true
+        },
         {
             "code": "th",
             "name": "Thai",
             "dependencies": [
-                { "name": "pythainlp", "url": "https://github.com/wannaphongcom/pythainlp" }
+                {
+                    "name": "pythainlp",
+                    "url": "https://github.com/wannaphongcom/pythainlp"
+                }
             ],
             "example": "นี่คือประโยค",
             "has_examples": true
         },
-        { "code": "tl", "name": "Tagalog" },
-        { "code": "tn", "name": "Setswana", "has_examples": true },
-        { "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
-        { "code": "tt", "name": "Tatar", "has_examples": true },
+        {
+            "code": "tl",
+            "name": "Tagalog"
+        },
+        {
+            "code": "tn",
+            "name": "Setswana",
+            "has_examples": true
+        },
+        {
+            "code": "tr",
+            "name": "Turkish",
+            "example": "Bu bir cümledir.",
+            "has_examples": true
+        },
+        {
+            "code": "tt",
+            "name": "Tatar",
+            "has_examples": true
+        },
         {
             "code": "uk",
             "name": "Ukrainian",
             "has_examples": true,
-            "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
+            "dependencies": [
+                {
+                    "name": "pymorphy2",
+                    "url": "https://github.com/kmike/pymorphy2"
+                }
+            ]
+        },
+        {
+            "code": "ur",
+            "name": "Urdu",
+            "example": "یہ ایک جملہ ہے",
+            "has_examples": true
         },
-        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
         {
             "code": "vi",
             "name": "Vietnamese",
-            "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
-        },
-        {
-            "code": "lij",
-            "name": "Ligurian",
-            "example": "Sta chì a l'é unna fraxe.",
-            "has_examples": true
-        },
-        {
-            "code": "hy",
-            "name": "Armenian",
-            "has_examples": true
-        },
-        {
-            "code": "gu",
-            "name": "Gujarati",
-            "has_examples": true
-        },
-        {
-            "code": "ml",
-            "name": "Malayalam",
-            "has_examples": true
-        },
-        {
-            "code": "ne",
-            "name": "Nepali",
-            "has_examples": true
-        },
-        {
-            "code": "mk",
-            "name": "Macedonian"
+            "dependencies": [
+                {
+                    "name": "Pyvi",
+                    "url": "https://github.com/trungtv/pyvi"
+                }
+            ]
         },
         {
             "code": "xx",
             "name": "Multi-language",
-            "models": ["xx_ent_wiki_sm", "xx_sent_ud_sm"],
+            "models": [
+                "xx_ent_wiki_sm",
+                "xx_sent_ud_sm"
+            ],
             "example": "This is a sentence about Facebook."
         },
-        { "code": "yo", "name": "Yoruba", "has_examples": true },
+        {
+            "code": "yo",
+            "name": "Yoruba",
+            "has_examples": true
+        },
         {
             "code": "zh",
             "name": "Chinese",
-            "models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg", "zh_core_web_trf"],
+            "models": [
+                "zh_core_web_sm",
+                "zh_core_web_md",
+                "zh_core_web_lg",
+                "zh_core_web_trf"
+            ],
             "dependencies": [
                 {
                     "name": "Jieba",
@@ -268,21 +492,57 @@
         }
     ],
     "licenses": [
-        { "id": "CC BY 4.0", "url": "https://creativecommons.org/licenses/by/4.0/" },
-        { "id": "CC BY-SA", "url": "https://creativecommons.org/licenses/by-sa/3.0/" },
-        { "id": "CC BY-SA 3.0", "url": "https://creativecommons.org/licenses/by-sa/3.0/" },
-        { "id": "CC BY-SA 4.0", "url": "https://creativecommons.org/licenses/by-sa/4.0/" },
-        { "id": "CC BY-NC", "url": "https://creativecommons.org/licenses/by-nc/3.0/" },
-        { "id": "CC BY-NC 3.0", "url": "https://creativecommons.org/licenses/by-nc/3.0/" },
-        { "id": "CC BY-NC 4.0", "url": "https://creativecommons.org/licenses/by-nc/4.0/" },
-        { "id": "CC-BY-NC-SA 3.0", "url": "https://creativecommons.org/licenses/by-nc-sa/3.0/" },
-        { "id": "GPL", "url": "https://www.gnu.org/licenses/gpl.html" },
-        { "id": "GPU GPL 3.0", "url": "https://www.gnu.org/licenses/gpl-3.0.en.html" },
-        { "id": "LGPL", "url": "https://www.gnu.org/licenses/lgpl.html" },
-        { "id": "MIT", "url": "https://opensource.org/licenses/MIT" },
+        {
+            "id": "CC BY 4.0",
+            "url": "https://creativecommons.org/licenses/by/4.0/"
+        },
+        {
+            "id": "CC BY-SA",
+            "url": "https://creativecommons.org/licenses/by-sa/3.0/"
+        },
+        {
+            "id": "CC BY-SA 3.0",
+            "url": "https://creativecommons.org/licenses/by-sa/3.0/"
+        },
+        {
+            "id": "CC BY-SA 4.0",
+            "url": "https://creativecommons.org/licenses/by-sa/4.0/"
+        },
+        {
+            "id": "CC BY-NC",
+            "url": "https://creativecommons.org/licenses/by-nc/3.0/"
+        },
+        {
+            "id": "CC BY-NC 3.0",
+            "url": "https://creativecommons.org/licenses/by-nc/3.0/"
+        },
+        {
+            "id": "CC BY-NC 4.0",
+            "url": "https://creativecommons.org/licenses/by-nc/4.0/"
+        },
+        {
+            "id": "CC-BY-NC-SA 3.0",
+            "url": "https://creativecommons.org/licenses/by-nc-sa/3.0/"
+        },
+        {
+            "id": "GPL",
+            "url": "https://www.gnu.org/licenses/gpl.html"
+        },
+        {
+            "id": "GPU GPL 3.0",
+            "url": "https://www.gnu.org/licenses/gpl-3.0.en.html"
+        },
+        {
+            "id": "LGPL",
+            "url": "https://www.gnu.org/licenses/lgpl.html"
+        },
+        {
+            "id": "MIT",
+            "url": "https://opensource.org/licenses/MIT"
+        },
         {
             "id": "LGPL-LR",
             "url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt"
         }
     ]
-}
+}
\ No newline at end of file

From 7d085d5b1c7a8efc017e1fc41735b222e776cf13 Mon Sep 17 00:00:00 2001
From: graue70 <23035329+graue70@users.noreply.github.com>
Date: Fri, 5 Mar 2021 18:30:09 +0100
Subject: [PATCH 012/146] Fix typo in docs

---
 website/docs/usage/processing-pipelines.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index b9824ea04..0058d40dc 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1273,7 +1273,7 @@ loss is calculated and to add evaluation scores to the training output.
 | [`update`](/api/pipe#update)         | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model.                                                                                                                                                                                           |
 | [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
 | [`get_loss`](/api/pipe#get_loss)     | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects.                                                                                                                                                                                                                                                 |
-| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_socre_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
+| [`score`](/api/pipe#score)           | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score.                            |
 
 <Infobox title="Custom trainable components and models" emoji="📖">
 

From dfb23a419ee0410f5ef0ce8ebd8b031cd5790e2d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 6 Mar 2021 17:38:54 +1100
Subject: [PATCH 013/146] =?UTF-8?q?Merge=20branch=20'spacy.io'=C2=A0[ci=20?=
 =?UTF-8?q?skip]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 website/docs/usage/v2-1.md |  2 +-
 website/docs/usage/v2-3.md | 33 ++++++++++++++++-----------------
 website/docs/usage/v2.md   |  2 +-
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md
index 8d310f1a4..500e43803 100644
--- a/website/docs/usage/v2-1.md
+++ b/website/docs/usage/v2-1.md
@@ -180,7 +180,7 @@ entirely **in Markdown**, without having to compromise on easy-to-use custom UI
 components. We're hoping that the Markdown source will make it even easier to
 contribute to the documentation. For more details, check out the
 [styleguide](/styleguide) and
-[source](https://github.com/explosion/spaCy/tree/master/website). While
+[source](https://github.com/explosion/spacy/tree/v2.x/website). While
 converting the pages to Markdown, we've also fixed a bunch of typos, improved
 the existing pages and added some new content:
 
diff --git a/website/docs/usage/v2-3.md b/website/docs/usage/v2-3.md
index b6c4d7dfb..075e1ce81 100644
--- a/website/docs/usage/v2-3.md
+++ b/website/docs/usage/v2-3.md
@@ -161,8 +161,8 @@ debugging your tokenizer configuration.
 
 spaCy's custom warnings have been replaced with native Python
 [`warnings`](https://docs.python.org/3/library/warnings.html). Instead of
-setting `SPACY_WARNING_IGNORE`, use the [`warnings`
-filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter)
+setting `SPACY_WARNING_IGNORE`, use the
+[`warnings` filters](https://docs.python.org/3/library/warnings.html#the-warnings-filter)
 to manage warnings.
 
 ```diff
@@ -176,7 +176,7 @@ import spacy
 #### Normalization tables
 
 The normalization tables have moved from the language data in
-[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang) to the
+[`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang) to the
 package [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
 If you're adding data for a new language, the normalization table should be
 added to `spacy-lookups-data`. See
@@ -190,8 +190,8 @@ lexemes will be added to the vocab automatically, just as in small models
 without vectors.
 
 To see the number of unique vectors and number of words with vectors, see
-`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000`
-unique vectors and `684830` words with vectors:
+`nlp.meta['vectors']`, for example for `en_core_web_md` there are `20000` unique
+vectors and `684830` words with vectors:
 
 ```python
 {
@@ -210,8 +210,8 @@ for orth in nlp.vocab.vectors:
     _ = nlp.vocab[orth]
 ```
 
-If your workflow previously iterated over `nlp.vocab`, a similar alternative
-is to iterate over words with vectors instead:
+If your workflow previously iterated over `nlp.vocab`, a similar alternative is
+to iterate over words with vectors instead:
 
 ```diff
 - lexemes = [w for w in nlp.vocab]
@@ -220,9 +220,9 @@ is to iterate over words with vectors instead:
 
 Be aware that the set of preloaded lexemes in a v2.2 model is not equivalent to
 the set of words with vectors. For English, v2.2 `md/lg` models have 1.3M
-provided lexemes but only 685K words with vectors. The vectors have been
-updated for most languages in v2.2, but the English models contain the same
-vectors for both v2.2 and v2.3.
+provided lexemes but only 685K words with vectors. The vectors have been updated
+for most languages in v2.2, but the English models contain the same vectors for
+both v2.2 and v2.3.
 
 #### Lexeme.is_oov and Token.is_oov
 
@@ -234,8 +234,7 @@ fixed in the next patch release v2.3.1.
 </Infobox>
 
 In v2.3, `Lexeme.is_oov` and `Token.is_oov` are `True` if the lexeme does not
-have a word vector. This is equivalent to `token.orth not in
-nlp.vocab.vectors`.
+have a word vector. This is equivalent to `token.orth not in nlp.vocab.vectors`.
 
 Previously in v2.2, `is_oov` corresponded to whether a lexeme had stored
 probability and cluster features. The probability and cluster features are no
@@ -270,8 +269,8 @@ as part of the model vocab.
 
 To load the probability table into a provided model, first make sure you have
 `spacy-lookups-data` installed. To load the table, remove the empty provided
-`lexeme_prob` table and then access `Lexeme.prob` for any word to load the
-table from `spacy-lookups-data`:
+`lexeme_prob` table and then access `Lexeme.prob` for any word to load the table
+from `spacy-lookups-data`:
 
 ```diff
 + # prerequisite: pip install spacy-lookups-data
@@ -321,9 +320,9 @@ the [train CLI](/api/cli#train), you can use the new `--tag-map-path` option to
 provide in the tag map as a JSON dict.
 
 If you want to export a tag map from a provided model for use with the train
-CLI, you can save it as a JSON dict. To only use string keys as required by
-JSON and to make it easier to read and edit, any internal integer IDs need to
-be converted back to strings:
+CLI, you can save it as a JSON dict. To only use string keys as required by JSON
+and to make it easier to read and edit, any internal integer IDs need to be
+converted back to strings:
 
 ```python
 import spacy
diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md
index aee3c24a6..210565c11 100644
--- a/website/docs/usage/v2.md
+++ b/website/docs/usage/v2.md
@@ -303,7 +303,7 @@ lookup-based lemmatization – and **many new languages**!
 <Infobox>
 
 **API:** [`Language`](/api/language) **Code:**
-[`spacy/lang`](https://github.com/explosion/spaCy/tree/master/spacy/lang)
+[`spacy/lang`](https://github.com/explosion/spacy/tree/v2.x/spacy/lang)
 **Usage:** [Adding languages](/usage/adding-languages)
 
 </Infobox>

From 97bcf2ae3a03cf97fd473062c1793e3d18ef2820 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sat, 6 Mar 2021 08:42:14 +0100
Subject: [PATCH 014/146] Fix patience for identical scores (#7250)

* Fix patience for identical scores

Fix training patience so that the earliest best step is chosen for
identical max scores.

* Restore break, remove print

* Explicitly define best_step for clarity
---
 spacy/training/loop.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index dacd2dba4..55919014b 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -230,7 +230,10 @@ def train_while_improving(
         if is_best_checkpoint is not None:
             losses = {}
         # Stop if no improvement in `patience` updates (if specified)
-        best_score, best_step = max(results)
+        # Negate step value so that the earliest best step is chosen for the
+        # same score, i.e. (1.0, 100) is chosen over (1.0, 200)
+        best_result = max((r_score, -r_step) for r_score, r_step in results)
+        best_step = -best_result[1]
         if patience and (step - best_step) >= patience:
             break
         # Stop if we've exhausted our max steps (if specified)

From cd70c3cb791b0e9a9e6323d8b79a267ba11284c9 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 9 Mar 2021 04:01:13 +0100
Subject: [PATCH 015/146] Fixing pretrain (#7342)

* initialize NLP with train corpus

* add more pretraining tests

* more tests

* function to fetch tok2vec layer for pretraining

* clarify parameter name

* test different objectives

* formatting

* fix check for static vectors when using vectors objective

* clarify docs

* logger statement

* fix init_tok2vec and proc.initialize order

* test training after pretraining

* add init_config tests for pretraining

* pop pretraining block to avoid config validation errors

* custom errors
---
 spacy/errors.py                               |   5 +-
 spacy/language.py                             |  12 +-
 spacy/ml/models/multi_task.py                 |   4 +-
 .../tests/serialize/test_serialize_config.py  |   2 +-
 spacy/tests/test_cli.py                       |  10 +-
 spacy/tests/training/test_pretraining.py      | 345 ++++++++++++++++++
 spacy/training/initialize.py                  |  21 +-
 spacy/training/pretrain.py                    |  42 ++-
 website/docs/api/architectures.md             |   7 +-
 9 files changed, 413 insertions(+), 35 deletions(-)
 create mode 100644 spacy/tests/training/test_pretraining.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 2ebc49e8c..4f61cf098 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -487,7 +487,10 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
-
+    E874 = ("Could not initialize the tok2vec model from component "
+            "'{component}' and layer '{layer}'.")
+    E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
+            "In the config, these are defined by the initialize.vectors setting.")
     E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
             "a list of spans, with each span represented by a tuple (start_char, end_char). "
             "The tuple can be optionally extended with a label and a KB ID.")
diff --git a/spacy/language.py b/spacy/language.py
index 2a9b50bcc..80de94278 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1222,10 +1222,6 @@ class Language:
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
-        pretrain_cfg = config.get("pretraining")
-        if pretrain_cfg:
-            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
-            init_tok2vec(self, P, I)
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
@@ -1244,6 +1240,10 @@ class Language:
                     proc.initialize, p_settings, section="components", name=name
                 )
                 proc.initialize(get_examples, nlp=self, **p_settings)
+        pretrain_cfg = config.get("pretraining")
+        if pretrain_cfg:
+            P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
+            init_tok2vec(self, P, I)
         self._link_components()
         self._optimizer = sgd
         if sgd is not None:
@@ -1592,6 +1592,7 @@ class Language:
         # using the nlp.config with all defaults.
         config = util.copy_config(config)
         orig_pipeline = config.pop("components", {})
+        orig_pretraining = config.pop("pretraining", None)
         config["components"] = {}
         if auto_fill:
             filled = registry.fill(config, validate=validate, schema=ConfigSchema)
@@ -1599,6 +1600,9 @@ class Language:
             filled = config
         filled["components"] = orig_pipeline
         config["components"] = orig_pipeline
+        if orig_pretraining is not None:
+            filled["pretraining"] = orig_pretraining
+            config["pretraining"] = orig_pretraining
         resolved_nlp = registry.resolve(
             filled["nlp"], validate=validate, schema=ConfigSchemaNlp
         )
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 8aa0f3c2b..cbfa59eea 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -21,6 +21,8 @@ def create_pretrain_vectors(
     maxout_pieces: int, hidden_size: int, loss: str
 ) -> Callable[["Vocab", Model], Model]:
     def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        if vocab.vectors.data.shape[1] == 0:
+            raise ValueError(Errors.E875)
         model = build_cloze_multi_task_model(
             vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
         )
@@ -134,7 +136,7 @@ def build_cloze_characters_multi_task_model(
 ) -> Model:
     output_layer = chain(
         list2array(),
-        Maxout(hidden_size, nP=maxout_pieces),
+        Maxout(nO=hidden_size, nP=maxout_pieces),
         LayerNorm(nI=hidden_size),
         MultiSoftmax([256] * nr_char, nI=hidden_size),
     )
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 86f726c43..66b66b744 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -293,7 +293,7 @@ def test_serialize_parser(parser_config_string):
 
 
 def test_config_nlp_roundtrip():
-    """Test that a config prduced by the nlp object passes training config
+    """Test that a config produced by the nlp object passes training config
     validation."""
     nlp = English()
     nlp.add_pipe("entity_ruler")
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index a3834f31a..c36be9c57 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -4,7 +4,7 @@ from spacy.training import docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
 from spacy.lang.nl import Dutch
-from spacy.util import ENV_VARS
+from spacy.util import ENV_VARS, load_model_from_config
 from spacy.cli import info
 from spacy.cli.init_config import init_config, RECOMMENDATIONS
 from spacy.cli._util import validate_project_commands, parse_config_overrides
@@ -397,10 +397,14 @@ def test_parse_cli_overrides():
     "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
 )
 @pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
-def test_init_config(lang, pipeline, optimize):
+@pytest.mark.parametrize("pretraining", [True, False])
+def test_init_config(lang, pipeline, optimize, pretraining):
     # TODO: add more tests and also check for GPU with transformers
-    config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, gpu=False)
+    config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, pretraining=pretraining, gpu=False)
     assert isinstance(config, Config)
+    if pretraining:
+        config["paths"]["raw_text"] = "my_data.jsonl"
+    nlp = load_model_from_config(config, auto_fill=True)
 
 
 def test_model_recommendations():
diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py
new file mode 100644
index 000000000..bd8810a5c
--- /dev/null
+++ b/spacy/tests/training/test_pretraining.py
@@ -0,0 +1,345 @@
+from pathlib import Path
+import numpy as np
+import pytest
+import srsly
+from spacy.vocab import Vocab
+from thinc.api import Config
+
+from ..util import make_tempdir
+from ... import util
+from ...lang.en import English
+from ...training.initialize import init_nlp
+from ...training.loop import train
+from ...training.pretrain import pretrain
+from ...tokens import Doc, DocBin
+from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
+
+pretrain_string_listener = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+max_epochs = 5
+
+[training]
+max_epochs = 5
+"""
+
+pretrain_string_internal = """
+[nlp]
+lang = "en"
+pipeline = ["tagger"]
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[pretraining]
+max_epochs = 5
+
+[training]
+max_epochs = 5
+"""
+
+
+pretrain_string_vectors = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+max_epochs = 5
+
+[pretraining.objective]
+@architectures = spacy.PretrainVectors.v1
+maxout_pieces = 3
+hidden_size = 300
+loss = cosine
+
+[training]
+max_epochs = 5
+"""
+
+CHAR_OBJECTIVES = [
+    {},
+    {"@architectures": "spacy.PretrainCharacters.v1"},
+    {
+        "@architectures": "spacy.PretrainCharacters.v1",
+        "maxout_pieces": 5,
+        "hidden_size": 42,
+        "n_characters": 2,
+    },
+]
+
+VECTOR_OBJECTIVES = [
+    {
+        "@architectures": "spacy.PretrainVectors.v1",
+        "maxout_pieces": 3,
+        "hidden_size": 300,
+        "loss": "cosine",
+    },
+    {
+        "@architectures": "spacy.PretrainVectors.v1",
+        "maxout_pieces": 2,
+        "hidden_size": 200,
+        "loss": "L2",
+    },
+]
+
+
+def test_pretraining_default():
+    """Test that pretraining defaults to a character objective"""
+    config = Config().from_str(pretrain_string_internal)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    assert "PretrainCharacters" in filled["pretraining"]["objective"]["@architectures"]
+
+
+@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
+def test_pretraining_tok2vec_characters(objective):
+    """Test that pretraining works with the character objective"""
+    config = Config().from_str(pretrain_string_listener)
+    config["pretraining"]["objective"] = objective
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        file_path = write_sample_jsonl(tmp_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled = filled.interpolate()
+        assert filled["pretraining"]["component"] == "tok2vec"
+        pretrain(filled, tmp_dir)
+        assert Path(tmp_dir / "model0.bin").exists()
+        assert Path(tmp_dir / "model4.bin").exists()
+        assert not Path(tmp_dir / "model5.bin").exists()
+
+
+@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
+def test_pretraining_tok2vec_vectors_fail(objective):
+    """Test that pretraining doesn't work with the vectors objective if there are no static vectors"""
+    config = Config().from_str(pretrain_string_listener)
+    config["pretraining"]["objective"] = objective
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        file_path = write_sample_jsonl(tmp_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled = filled.interpolate()
+        assert filled["initialize"]["vectors"] is None
+        with pytest.raises(ValueError):
+            pretrain(filled, tmp_dir)
+
+
+@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
+def test_pretraining_tok2vec_vectors(objective):
+    """Test that pretraining works with the vectors objective and static vectors defined"""
+    config = Config().from_str(pretrain_string_listener)
+    config["pretraining"]["objective"] = objective
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        file_path = write_sample_jsonl(tmp_dir)
+        filled["paths"]["raw_text"] = file_path
+        nlp_path = write_vectors_model(tmp_dir)
+        filled["initialize"]["vectors"] = nlp_path
+        filled = filled.interpolate()
+        pretrain(filled, tmp_dir)
+
+
+@pytest.mark.parametrize("config", [pretrain_string_internal, pretrain_string_listener])
+def test_pretraining_tagger_tok2vec(config):
+    """Test pretraining of the tagger's tok2vec layer (internal layer or via a listener)"""
+    config = Config().from_str(config)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        file_path = write_sample_jsonl(tmp_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled["pretraining"]["component"] = "tagger"
+        filled["pretraining"]["layer"] = "tok2vec"
+        filled = filled.interpolate()
+        pretrain(filled, tmp_dir)
+        assert Path(tmp_dir / "model0.bin").exists()
+        assert Path(tmp_dir / "model4.bin").exists()
+        assert not Path(tmp_dir / "model5.bin").exists()
+
+
+def test_pretraining_tagger():
+    """Test pretraining of the tagger itself will throw an error (not an appropriate tok2vec layer)"""
+    config = Config().from_str(pretrain_string_internal)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        file_path = write_sample_jsonl(tmp_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled["pretraining"]["component"] = "tagger"
+        filled = filled.interpolate()
+        with pytest.raises(ValueError):
+            pretrain(filled, tmp_dir)
+
+
+def test_pretraining_training():
+    """Test that training can use a pretrained Tok2Vec model"""
+    config = Config().from_str(pretrain_string_internal)
+    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
+    filled = nlp.config
+    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = pretrain_config.merge(filled)
+    train_config = util.load_config(DEFAULT_CONFIG_PATH)
+    filled = train_config.merge(filled)
+    with make_tempdir() as tmp_dir:
+        pretrain_dir = tmp_dir / "pretrain"
+        pretrain_dir.mkdir()
+        file_path = write_sample_jsonl(pretrain_dir)
+        filled["paths"]["raw_text"] = file_path
+        filled["pretraining"]["component"] = "tagger"
+        filled["pretraining"]["layer"] = "tok2vec"
+        train_dir = tmp_dir / "train"
+        train_dir.mkdir()
+        train_path, dev_path = write_sample_training(train_dir)
+        filled["paths"]["train"] = train_path
+        filled["paths"]["dev"] = dev_path
+        filled = filled.interpolate()
+        P = filled["pretraining"]
+        nlp_base = init_nlp(filled)
+        model_base = nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        embed_base = None
+        for node in model_base.walk():
+            if node.name == "hashembed":
+                embed_base = node
+        pretrain(filled, pretrain_dir)
+        pretrained_model = Path(pretrain_dir / "model3.bin")
+        assert pretrained_model.exists()
+        filled["initialize"]["init_tok2vec"] = str(pretrained_model)
+        nlp = init_nlp(filled)
+        model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
+        embed = None
+        for node in model.walk():
+            if node.name == "hashembed":
+                embed = node
+        # ensure that the tok2vec weights are actually changed by the pretraining
+        assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
+        train(nlp, train_dir)
+
+
+def write_sample_jsonl(tmp_dir):
+    data = [
+        {
+            "meta": {"id": "1"},
+            "text": "This is the best TV you'll ever buy!",
+            "cats": {"pos": 1, "neg": 0},
+        },
+        {
+            "meta": {"id": "2"},
+            "text": "I wouldn't buy this again.",
+            "cats": {"pos": 0, "neg": 1},
+        },
+    ]
+    file_path = f"{tmp_dir}/text.jsonl"
+    srsly.write_jsonl(file_path, data)
+    return file_path
+
+
+def write_sample_training(tmp_dir):
+    words = ["The", "players", "start", "."]
+    tags = ["DT", "NN", "VBZ", "."]
+    doc = Doc(English().vocab, words=words, tags=tags)
+    doc_bin = DocBin()
+    doc_bin.add(doc)
+    train_path = f"{tmp_dir}/train.spacy"
+    dev_path = f"{tmp_dir}/dev.spacy"
+    doc_bin.to_disk(train_path)
+    doc_bin.to_disk(dev_path)
+    return train_path, dev_path
+
+
+def write_vectors_model(tmp_dir):
+    import numpy
+    vocab = Vocab()
+    vector_data = {
+        "dog": numpy.random.uniform(-1, 1, (300,)),
+        "cat": numpy.random.uniform(-1, 1, (300,)),
+        "orange": numpy.random.uniform(-1, 1, (300,))
+    }
+    for word, vector in vector_data.items():
+        vocab.set_vector(word, vector)
+    nlp_path = tmp_dir / "vectors_model"
+    nlp = English(vocab)
+    nlp.to_disk(nlp_path)
+    return str(nlp_path)
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 25bb73c78..f7f2f21a4 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -9,6 +9,7 @@ import gzip
 import zipfile
 import tqdm
 
+from .pretrain import get_tok2vec_ref
 from ..lookups import Lookups
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
@@ -147,10 +148,6 @@ def init_tok2vec(
     weights_data = None
     init_tok2vec = ensure_path(I["init_tok2vec"])
     if init_tok2vec is not None:
-        if P["objective"].get("type") == "vectors" and not I["vectors"]:
-            err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
-            errors = [{"loc": ["initialize"], "msg": err}]
-            raise ConfigValidationError(config=nlp.config, errors=errors)
         if not init_tok2vec.exists():
             err = f"can't find pretrained tok2vec: {init_tok2vec}"
             errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
@@ -158,21 +155,9 @@ def init_tok2vec(
         with init_tok2vec.open("rb") as file_:
             weights_data = file_.read()
     if weights_data is not None:
-        tok2vec_component = P["component"]
-        if tok2vec_component is None:
-            desc = (
-                f"To use pretrained tok2vec weights, [pretraining.component] "
-                f"needs to specify the component that should load them."
-            )
-            err = "component can't be null"
-            errors = [{"loc": ["pretraining", "component"], "msg": err}]
-            raise ConfigValidationError(
-                config=nlp.config["pretraining"], errors=errors, desc=desc
-            )
-        layer = nlp.get_pipe(tok2vec_component).model
-        if P["layer"]:
-            layer = layer.get_ref(P["layer"])
+        layer = get_tok2vec_ref(nlp, P)
         layer.from_bytes(weights_data)
+        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
         return True
     return False
 
diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index 152d849e9..c791732db 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -6,9 +6,12 @@ from collections import Counter
 import srsly
 import time
 import re
+
+from thinc.config import ConfigValidationError
 from wasabi import Printer
 
 from .example import Example
+from ..errors import Errors
 from ..tokens import Doc
 from ..schemas import ConfigSchemaPretrain
 from ..util import registry, load_model_from_config, dot_to_object
@@ -133,12 +136,21 @@ def create_pretraining_model(nlp, pretrain_config):
     The actual tok2vec layer is stored as a reference, and only this bit will be
     serialized to file and read back in when calling the 'train' command.
     """
-    nlp.initialize()
-    component = nlp.get_pipe(pretrain_config["component"])
-    if pretrain_config.get("layer"):
-        tok2vec = component.model.get_ref(pretrain_config["layer"])
-    else:
-        tok2vec = component.model
+    with nlp.select_pipes(enable=[]):
+        nlp.initialize()
+    tok2vec = get_tok2vec_ref(nlp, pretrain_config)
+    # If the config referred to a Tok2VecListener, grab the original model instead
+    if type(tok2vec).__name__ == "Tok2VecListener":
+        original_tok2vec = (
+            tok2vec.upstream_name if tok2vec.upstream_name != "*" else "tok2vec"
+        )
+        tok2vec = nlp.get_pipe(original_tok2vec).model
+    try:
+        tok2vec.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
+    except ValueError:
+        component = pretrain_config["component"]
+        layer = pretrain_config["layer"]
+        raise ValueError(Errors.E874.format(component=component, layer=layer))
 
     create_function = pretrain_config["objective"]
     model = create_function(nlp.vocab, tok2vec)
@@ -147,6 +159,24 @@ def create_pretraining_model(nlp, pretrain_config):
     return model
 
 
+def get_tok2vec_ref(nlp, pretrain_config):
+    tok2vec_component = pretrain_config["component"]
+    if tok2vec_component is None:
+        desc = (
+            f"To use pretrained tok2vec weights, [pretraining.component] "
+            f"needs to specify the component that should load them."
+        )
+        err = "component can't be null"
+        errors = [{"loc": ["pretraining", "component"], "msg": err}]
+        raise ConfigValidationError(
+            config=nlp.config["pretraining"], errors=errors, desc=desc
+        )
+    layer = nlp.get_pipe(tok2vec_component).model
+    if pretrain_config["layer"]:
+        layer = layer.get_ref(pretrain_config["layer"])
+    return layer
+
+
 class ProgressTracker:
     def __init__(self, frequency=1000000):
         self.loss = 0.0
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 1739836ed..793855d18 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -447,6 +447,9 @@ For more information, see the section on
 > ```ini
 > [pretraining]
 > component = "tok2vec"
+> 
+> [initialize]
+> vectors = "en_core_web_lg"
 > ...
 >
 > [pretraining.objective]
@@ -457,7 +460,9 @@ For more information, see the section on
 > ```
 
 Predict the word's vector from a static embeddings table as pretraining
-objective for a Tok2Vec layer.
+objective for a Tok2Vec layer. To use this objective, make sure that the 
+`initialize.vectors` section in the config refers to a model with static 
+vectors.
 
 | Name            | Description                                                                                                                                               |
 | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |

From 3f3e8110dc6ec3c1449b120c29ffc7d5475ef622 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 9 Mar 2021 04:02:32 +0100
Subject: [PATCH 016/146] Fix lowercase augmentation (#7336)

* Fix aborted/skipped augmentation for `spacy.orth_variants.v1` if
lowercasing was enabled for an example
* Simplify `spacy.orth_variants.v1` for `Example` vs. `GoldParse`
* Preserve reference tokenization in `spacy.lower_case.v1`
---
 spacy/tests/training/test_augmenters.py |  61 ++++++++-
 spacy/training/augment.py               | 158 +++++++++---------------
 2 files changed, 114 insertions(+), 105 deletions(-)

diff --git a/spacy/tests/training/test_augmenters.py b/spacy/tests/training/test_augmenters.py
index 0bd4d5ef2..43a78e4b0 100644
--- a/spacy/tests/training/test_augmenters.py
+++ b/spacy/tests/training/test_augmenters.py
@@ -38,19 +38,59 @@ def doc(nlp):
 
 
 @pytest.mark.filterwarnings("ignore::UserWarning")
-def test_make_orth_variants(nlp, doc):
+def test_make_orth_variants(nlp):
     single = [
         {"tags": ["NFP"], "variants": ["…", "..."]},
         {"tags": [":"], "variants": ["-", "—", "–", "--", "---", "——"]},
     ]
+    # fmt: off
+    words = ["\n\n", "A", "\t", "B", "a", "b", "…", "...", "-", "—", "–", "--", "---", "——"]
+    tags = ["_SP", "NN", "\t", "NN", "NN", "NN", "NFP", "NFP", ":", ":", ":", ":", ":", ":"]
+    # fmt: on
+    spaces = [True] * len(words)
+    spaces[0] = False
+    spaces[2] = False
+    doc = Doc(nlp.vocab, words=words, spaces=spaces, tags=tags)
     augmenter = create_orth_variants_augmenter(
         level=0.2, lower=0.5, orth_variants={"single": single}
     )
-    with make_docbin([doc]) as output_file:
+    with make_docbin([doc] * 10) as output_file:
         reader = Corpus(output_file, augmenter=augmenter)
-        # Due to randomness, only test that it works without errors for now
+        # Due to randomness, only test that it works without errors
         list(reader(nlp))
 
+    # check that the following settings lowercase everything
+    augmenter = create_orth_variants_augmenter(
+        level=1.0, lower=1.0, orth_variants={"single": single}
+    )
+    with make_docbin([doc] * 10) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        for example in reader(nlp):
+            for token in example.reference:
+                assert token.text == token.text.lower()
+
+    # check that lowercasing is applied without tags
+    doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words))
+    augmenter = create_orth_variants_augmenter(
+        level=1.0, lower=1.0, orth_variants={"single": single}
+    )
+    with make_docbin([doc] * 10) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        for example in reader(nlp):
+            for ex_token, doc_token in zip(example.reference, doc):
+                assert ex_token.text == doc_token.text.lower()
+
+    # check that no lowercasing is applied with lower=0.0
+    doc = Doc(nlp.vocab, words=words, spaces=[True] * len(words))
+    augmenter = create_orth_variants_augmenter(
+        level=1.0, lower=0.0, orth_variants={"single": single}
+    )
+    with make_docbin([doc] * 10) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        for example in reader(nlp):
+            for ex_token, doc_token in zip(example.reference, doc):
+                assert ex_token.text == doc_token.text
+
 
 def test_lowercase_augmenter(nlp, doc):
     augmenter = create_lower_casing_augmenter(level=1.0)
@@ -66,6 +106,21 @@ def test_lowercase_augmenter(nlp, doc):
         assert ref_ent.text == orig_ent.text.lower()
     assert [t.pos_ for t in eg.reference] == [t.pos_ for t in doc]
 
+    # check that augmentation works when lowercasing leads to different
+    # predicted tokenization
+    words = ["A", "B", "CCC."]
+    doc = Doc(nlp.vocab, words=words)
+    with make_docbin([doc]) as output_file:
+        reader = Corpus(output_file, augmenter=augmenter)
+        corpus = list(reader(nlp))
+    eg = corpus[0]
+    assert eg.reference.text == doc.text.lower()
+    assert eg.predicted.text == doc.text.lower()
+    assert [t.text for t in eg.reference] == [t.lower() for t in words]
+    assert [t.text for t in eg.predicted] == [
+        t.text for t in nlp.make_doc(doc.text.lower())
+    ]
+
 
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_custom_data_augmentation(nlp, doc):
diff --git a/spacy/training/augment.py b/spacy/training/augment.py
index 13ae45bd2..0dae92143 100644
--- a/spacy/training/augment.py
+++ b/spacy/training/augment.py
@@ -1,12 +1,10 @@
 from typing import Callable, Iterator, Dict, List, Tuple, TYPE_CHECKING
 import random
 import itertools
-import copy
 from functools import partial
 from pydantic import BaseModel, StrictStr
 
 from ..util import registry
-from ..tokens import Doc
 from .example import Example
 
 if TYPE_CHECKING:
@@ -71,7 +69,7 @@ def lower_casing_augmenter(
     else:
         example_dict = example.to_dict()
         doc = nlp.make_doc(example.text.lower())
-        example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in doc]
+        example_dict["token_annotation"]["ORTH"] = [t.lower_ for t in example.reference]
         yield example.from_dict(doc, example_dict)
 
 
@@ -88,24 +86,15 @@ def orth_variants_augmenter(
     else:
         raw_text = example.text
         orig_dict = example.to_dict()
-        if not orig_dict["token_annotation"]:
-            yield example
-        else:
-            variant_text, variant_token_annot = make_orth_variants(
-                nlp,
-                raw_text,
-                orig_dict["token_annotation"],
-                orth_variants,
-                lower=raw_text is not None and random.random() < lower,
-            )
-            if variant_text:
-                doc = nlp.make_doc(variant_text)
-            else:
-                doc = Doc(nlp.vocab, words=variant_token_annot["ORTH"])
-                variant_token_annot["ORTH"] = [w.text for w in doc]
-                variant_token_annot["SPACY"] = [w.whitespace_ for w in doc]
-            orig_dict["token_annotation"] = variant_token_annot
-            yield example.from_dict(doc, orig_dict)
+        variant_text, variant_token_annot = make_orth_variants(
+            nlp,
+            raw_text,
+            orig_dict["token_annotation"],
+            orth_variants,
+            lower=raw_text is not None and random.random() < lower,
+        )
+        orig_dict["token_annotation"] = variant_token_annot
+        yield example.from_dict(nlp.make_doc(variant_text), orig_dict)
 
 
 def make_orth_variants(
@@ -116,88 +105,53 @@ def make_orth_variants(
     *,
     lower: bool = False,
 ) -> Tuple[str, Dict[str, List[str]]]:
-    orig_token_dict = copy.deepcopy(token_dict)
-    ndsv = orth_variants.get("single", [])
-    ndpv = orth_variants.get("paired", [])
     words = token_dict.get("ORTH", [])
     tags = token_dict.get("TAG", [])
-    # keep unmodified if words or tags are not defined
-    if words and tags:
-        if lower:
-            words = [w.lower() for w in words]
-        # single variants
-        punct_choices = [random.choice(x["variants"]) for x in ndsv]
-        for word_idx in range(len(words)):
-            for punct_idx in range(len(ndsv)):
-                if (
-                    tags[word_idx] in ndsv[punct_idx]["tags"]
-                    and words[word_idx] in ndsv[punct_idx]["variants"]
-                ):
-                    words[word_idx] = punct_choices[punct_idx]
-        # paired variants
-        punct_choices = [random.choice(x["variants"]) for x in ndpv]
-        for word_idx in range(len(words)):
-            for punct_idx in range(len(ndpv)):
-                if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
-                    word_idx
-                ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
-                    # backup option: random left vs. right from pair
-                    pair_idx = random.choice([0, 1])
-                    # best option: rely on paired POS tags like `` / ''
-                    if len(ndpv[punct_idx]["tags"]) == 2:
-                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
-                    # next best option: rely on position in variants
-                    # (may not be unambiguous, so order of variants matters)
-                    else:
-                        for pair in ndpv[punct_idx]["variants"]:
-                            if words[word_idx] in pair:
-                                pair_idx = pair.index(words[word_idx])
-                    words[word_idx] = punct_choices[punct_idx][pair_idx]
+    # keep unmodified if words are not defined
+    if not words:
+        return raw, token_dict
+    if lower:
+        words = [w.lower() for w in words]
+        raw = raw.lower()
+    # if no tags, only lowercase
+    if not tags:
         token_dict["ORTH"] = words
-        token_dict["TAG"] = tags
-    # modify raw
-    if raw is not None:
-        variants = []
-        for single_variants in ndsv:
-            variants.extend(single_variants["variants"])
-        for paired_variants in ndpv:
-            variants.extend(
-                list(itertools.chain.from_iterable(paired_variants["variants"]))
-            )
-        # store variants in reverse length order to be able to prioritize
-        # longer matches (e.g., "---" before "--")
-        variants = sorted(variants, key=lambda x: len(x))
-        variants.reverse()
-        variant_raw = ""
-        raw_idx = 0
-        # add initial whitespace
-        while raw_idx < len(raw) and raw[raw_idx].isspace():
-            variant_raw += raw[raw_idx]
-            raw_idx += 1
-        for word in words:
-            match_found = False
-            # skip whitespace words
-            if word.isspace():
-                match_found = True
-            # add identical word
-            elif word not in variants and raw[raw_idx:].startswith(word):
-                variant_raw += word
-                raw_idx += len(word)
-                match_found = True
-            # add variant word
-            else:
-                for variant in variants:
-                    if not match_found and raw[raw_idx:].startswith(variant):
-                        raw_idx += len(variant)
-                        variant_raw += word
-                        match_found = True
-            # something went wrong, abort
-            # (add a warning message?)
-            if not match_found:
-                return raw, orig_token_dict
-            # add following whitespace
-            while raw_idx < len(raw) and raw[raw_idx].isspace():
-                variant_raw += raw[raw_idx]
-                raw_idx += 1
-        raw = variant_raw
+        return raw, token_dict
+    # single variants
+    ndsv = orth_variants.get("single", [])
+    punct_choices = [random.choice(x["variants"]) for x in ndsv]
+    for word_idx in range(len(words)):
+        for punct_idx in range(len(ndsv)):
+            if (
+                tags[word_idx] in ndsv[punct_idx]["tags"]
+                and words[word_idx] in ndsv[punct_idx]["variants"]
+            ):
+                words[word_idx] = punct_choices[punct_idx]
+    # paired variants
+    ndpv = orth_variants.get("paired", [])
+    punct_choices = [random.choice(x["variants"]) for x in ndpv]
+    for word_idx in range(len(words)):
+        for punct_idx in range(len(ndpv)):
+            if tags[word_idx] in ndpv[punct_idx]["tags"] and words[
+                word_idx
+            ] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
+                # backup option: random left vs. right from pair
+                pair_idx = random.choice([0, 1])
+                # best option: rely on paired POS tags like `` / ''
+                if len(ndpv[punct_idx]["tags"]) == 2:
+                    pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
+                # next best option: rely on position in variants
+                # (may not be unambiguous, so order of variants matters)
+                else:
+                    for pair in ndpv[punct_idx]["variants"]:
+                        if words[word_idx] in pair:
+                            pair_idx = pair.index(words[word_idx])
+                words[word_idx] = punct_choices[punct_idx][pair_idx]
+    token_dict["ORTH"] = words
+    # construct modified raw text from words and spaces
+    raw = ""
+    for orth, spacy in zip(token_dict["ORTH"], token_dict["SPACY"]):
+        raw += orth
+        if spacy:
+            raw += " "
     return raw, token_dict

From f26b61e0015a144c5b97d5087c9cb7486f55dbfc Mon Sep 17 00:00:00 2001
From: Jan Krepl <jankrepl@yahoo.com>
Date: Tue, 9 Mar 2021 10:49:53 +0100
Subject: [PATCH 017/146] Make sure sorted

---
 spacy/pipeline/entityruler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 25bc3abee..4e61dbca7 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -195,7 +195,7 @@ class EntityRuler(Pipe):
                 all_labels.add(label)
             else:
                 all_labels.add(l)
-        return tuple(all_labels)
+        return tuple(sorted(all_labels))
 
     def initialize(
         self,

From 0e1d579f0c65dbcbde05e3430e64c299d58205df Mon Sep 17 00:00:00 2001
From: Jan Krepl <jankrepl@yahoo.com>
Date: Tue, 9 Mar 2021 10:57:32 +0100
Subject: [PATCH 018/146] Add agreement

---
 .github/contributors/jankrepl.md | 106 +++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/contributors/jankrepl.md

diff --git a/.github/contributors/jankrepl.md b/.github/contributors/jankrepl.md
new file mode 100644
index 000000000..eda5a29b8
--- /dev/null
+++ b/.github/contributors/jankrepl.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Jan Krepl            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-09           |
+| GitHub username                | jankrepl             |
+| Website (optional)             |                      |

From 39de3602e0321eb2dbcfce032a8d4734162ee69d Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 9 Mar 2021 13:01:31 +0100
Subject: [PATCH 019/146] return custom error in nlp.initialize (#7104)

* return custom error in nlp.initialize

* Rename error

Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/errors.py   | 4 ++++
 spacy/language.py | 9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4f61cf098..e50a658d8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -497,6 +497,10 @@ class Errors:
     E880 = ("The 'wandb' library could not be found - did you install it? "
             "Alternatively, specify the 'ConsoleLogger' in the 'training.logger' "
             "config section, instead of the 'WandbLogger'.")
+    E884 = ("The pipeline could not be initialized because the vectors "
+            "could not be found at '{vectors}'. If your pipeline was already "
+            "initialized/trained before, call 'resume_training' instead of 'initialize', "
+            "or initialize only the components that are new.")
     E885 = ("entity_linker.set_kb received an invalid 'kb_loader' argument: expected "
             "a callable function, but got: {arg_type}")
     E886 = ("Can't replace {name} -> {tok2vec} listeners: path '{path}' not "
diff --git a/spacy/language.py b/spacy/language.py
index 80de94278..5741ef97c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1219,9 +1219,12 @@ class Language:
         before_init = I["before_init"]
         if before_init is not None:
             before_init(self)
-        init_vocab(
-            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
-        )
+        try:
+            init_vocab(
+                self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
+            )
+        except IOError:
+            raise IOError(Errors.E884.format(vectors=I["vectors"]))
         if self.vocab.vectors.data.shape[1] >= 1:
             ops = get_current_ops()
             self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)

From 932887b950751020d3fb4b7f83a5a27b5512faf1 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 9 Mar 2021 13:04:22 +0100
Subject: [PATCH 020/146] textcat scoring fix and multi_label docs (#6974)

* add multi-label textcat to menu

* add infobox on textcat API

* add info to v3 migration guide

* small edits

* further fixes in doc strings

* add infobox to textcat architectures

* add textcat_multilabel to overview of built-in components

* spelling

* fix unrelated warn msg

* Add textcat_multilabel to quickstart [ci skip]

* remove separate documentation page for multilabel_textcategorizer

* small edits

* positive label clarification

* avoid duplicating information in self.cfg and fix textcat.score

* fix multilabel textcat too

* revert threshold to storage in cfg

* revert threshold stuff for multi-textcat

Co-authored-by: Ines Montani <ines@ines.io>
---
 spacy/cli/download.py                         |   2 +-
 spacy/pipeline/textcat.py                     |  16 +-
 spacy/pipeline/textcat_multilabel.py          |  19 +-
 spacy/tests/pipeline/test_textcat.py          |  48 ++
 website/docs/api/architectures.md             |  11 +
 .../docs/api/multilabel_textcategorizer.md    | 453 ------------------
 website/docs/api/textcategorizer.md           |  66 ++-
 website/docs/usage/processing-pipelines.md    |  37 +-
 website/docs/usage/v3.md                      |  21 +-
 website/src/widgets/quickstart-training.js    |  38 +-
 10 files changed, 191 insertions(+), 520 deletions(-)
 delete mode 100644 website/docs/api/multilabel_textcategorizer.md

diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index dbda8578a..d09d5147a 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -60,7 +60,7 @@ def download(model: str, direct: bool = False, sdist: bool = False, *pip_args) -
         model_name = model
         if model in OLD_MODEL_SHORTCUTS:
             msg.warn(
-                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please"
+                f"As of spaCy v3.0, shortcuts like '{model}' are deprecated. Please "
                 f"use the full pipeline package name '{OLD_MODEL_SHORTCUTS[model]}' instead."
             )
             model_name = OLD_MODEL_SHORTCUTS[model]
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index f94bde84f..174ffd273 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -88,11 +88,9 @@ subword_features = true
 def make_textcat(
     nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be mutually exclusive (i.e. one true label per doc).
 
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
@@ -317,9 +315,11 @@ class TextCategorizer(TrainablePipe):
         get_examples (Callable[[], Iterable[Example]]): Function that
             returns a representative sample of gold-standard Example objects.
         nlp (Language): The current nlp object the component is part of.
-        labels: The labels to add to the component, typically generated by the
+        labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the
             `init labels` command. If no labels are provided, the get_examples
             callback is used to extract the labels from the data.
+        positive_label (Optional[str]): The positive label for a binary task with exclusive classes,
+            `None` otherwise and by default.
 
         DOCS: https://spacy.io/api/textcategorizer#initialize
         """
@@ -358,13 +358,13 @@ class TextCategorizer(TrainablePipe):
         """
         validate_examples(examples, "TextCategorizer.score")
         self._validate_categories(examples)
+        kwargs.setdefault("threshold", self.cfg["threshold"])
+        kwargs.setdefault("positive_label", self.cfg["positive_label"])
         return Scorer.score_cats(
             examples,
             "cats",
             labels=self.labels,
             multi_label=False,
-            positive_label=self.cfg["positive_label"],
-            threshold=self.cfg["threshold"],
             **kwargs,
         )
 
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index dc4b17940..036bc8dc5 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -88,11 +88,10 @@ subword_features = true
 def make_multilabel_textcat(
     nlp: Language, name: str, model: Model[List[Doc], List[Floats2d]], threshold: float
 ) -> "TextCategorizer":
-    """Create a TextCategorizer compoment. The text categorizer predicts categories
-    over a whole document. It can learn one or more labels, and the labels can
-    be mutually exclusive (i.e. one true label per doc) or non-mutually exclusive
-    (i.e. zero or more labels may be true per doc). The multi-label setting is
-    controlled by the model instance that's provided.
+    """Create a TextCategorizer component. The text categorizer predicts categories
+    over a whole document. It can learn one or more labels, and the labels are considered
+    to be non-mutually exclusive, which means that there can be zero or more labels
+    per doc.
 
     model (Model[List[Doc], List[Floats2d]]): A model instance that predicts
         scores for each category.
@@ -104,7 +103,7 @@ def make_multilabel_textcat(
 class MultiLabel_TextCategorizer(TextCategorizer):
     """Pipeline component for multi-label text classification.
 
-    DOCS: https://spacy.io/api/multilabel_textcategorizer
+    DOCS: https://spacy.io/api/textcategorizer
     """
 
     def __init__(
@@ -123,7 +122,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
             losses during training.
         threshold (float): Cutoff to consider a prediction "positive".
 
-        DOCS: https://spacy.io/api/multilabel_textcategorizer#init
+        DOCS: https://spacy.io/api/textcategorizer#init
         """
         self.vocab = vocab
         self.model = model
@@ -149,7 +148,7 @@ class MultiLabel_TextCategorizer(TextCategorizer):
             `init labels` command. If no labels are provided, the get_examples
             callback is used to extract the labels from the data.
 
-        DOCS: https://spacy.io/api/multilabel_textcategorizer#initialize
+        DOCS: https://spacy.io/api/textcategorizer#initialize
         """
         validate_get_examples(get_examples, "MultiLabel_TextCategorizer.initialize")
         if labels is None:
@@ -173,15 +172,15 @@ class MultiLabel_TextCategorizer(TextCategorizer):
         examples (Iterable[Example]): The examples to score.
         RETURNS (Dict[str, Any]): The scores, produced by Scorer.score_cats.
 
-        DOCS: https://spacy.io/api/multilabel_textcategorizer#score
+        DOCS: https://spacy.io/api/textcategorizer#score
         """
         validate_examples(examples, "MultiLabel_TextCategorizer.score")
+        kwargs.setdefault("threshold", self.cfg["threshold"])
         return Scorer.score_cats(
             examples,
             "cats",
             labels=self.labels,
             multi_label=True,
-            threshold=self.cfg["threshold"],
             **kwargs,
         )
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 2b01a9cc8..61af16eb5 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -370,3 +370,51 @@ def test_textcat_evaluation():
 
     assert scores["cats_micro_p"] == 4 / 5
     assert scores["cats_micro_r"] == 4 / 6
+
+
+def test_textcat_threshold():
+    # Ensure the scorer can be called with a different threshold
+    nlp = English()
+    nlp.add_pipe("textcat")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    macro_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0, "positive_label": "POSITIVE"})
+    pos_f = scores["cats_score"]
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+    assert pos_f > macro_f
+
+
+def test_textcat_multi_threshold():
+    # Ensure the scorer can be called with a different threshold
+    nlp = English()
+    nlp.add_pipe("textcat_multilabel")
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 1.0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 0
+
+    scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
+    assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 793855d18..9b099d8e2 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -589,6 +589,17 @@ several different built-in architectures. It is recommended to experiment with
 different architectures and settings to determine what works best on your
 specific data and challenge.
 
+<Infobox title="Single-label vs. multi-label classification" variant="warning">
+
+When the architecture for a text classification challenge contains a setting for
+`exclusive_classes`, it is important to use the correct value for the correct
+pipeline component. The `textcat` component should always be used for
+single-label use-cases where `exclusive_classes = true`, while the
+`textcat_multilabel` should be used for multi-label settings with
+`exclusive_classes = false`.
+
+</Infobox>
+
 ### spacy.TextCatEnsemble.v2 {#TextCatEnsemble}
 
 > #### Example Config
diff --git a/website/docs/api/multilabel_textcategorizer.md b/website/docs/api/multilabel_textcategorizer.md
deleted file mode 100644
index 6e1a627c6..000000000
--- a/website/docs/api/multilabel_textcategorizer.md
+++ /dev/null
@@ -1,453 +0,0 @@
----
-title: Multi-label TextCategorizer
-tag: class
-source: spacy/pipeline/textcat_multilabel.py
-new: 3
-teaser: 'Pipeline component for multi-label text classification'
-api_base_class: /api/pipe
-api_string_name: textcat_multilabel
-api_trainable: true
----
-
-The text categorizer predicts **categories over a whole document**. It 
-learns non-mutually exclusive labels, which means that zero or more labels 
-may be true per document.
-
-## Config and implementation {#config}
-
-The default config is defined by the pipeline component factory and describes
-how the component should be configured. You can override its settings via the
-`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
-[`config.cfg` for training](/usage/training#config). See the
-[model architectures](/api/architectures) documentation for details on the
-architectures and their arguments and hyperparameters.
-
-> #### Example
->
-> ```python
-> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
-> config = {
->    "threshold": 0.5,
->    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-> }
-> nlp.add_pipe("textcat_multilabel", config=config)
-> ```
-
-| Setting     | Description                                                                                                                                                      |
-| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
-| `model`     | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
-
-```python
-%%GITHUB_SPACY/spacy/pipeline/textcat_multilabel.py
-```
-
-## MultiLabel_TextCategorizer.\_\_init\_\_ {#init tag="method"}
-
-> #### Example
->
-> ```python
-> # Construction via add_pipe with default model
-> textcat = nlp.add_pipe("textcat_multilabel")
->
-> # Construction via add_pipe with custom model
-> config = {"model": {"@architectures": "my_textcat"}}
-> parser = nlp.add_pipe("textcat_multilabel", config=config)
->
-> # Construction from class
-> from spacy.pipeline import MultiLabel_TextCategorizer
-> textcat = MultiLabel_TextCategorizer(nlp.vocab, model, threshold=0.5)
-> ```
-
-Create a new pipeline instance. In your application, you would normally use a
-shortcut for this and instantiate the component using its string name and
-[`nlp.add_pipe`](/api/language#create_pipe).
-
-| Name           | Description                                                                                                                |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
-| `vocab`        | The shared vocabulary. ~~Vocab~~                                                                                           |
-| `model`        | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `name`         | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~                        |
-| _keyword-only_ |                                                                                                                            |
-| `threshold`    | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                             |
-
-## MultiLabel_TextCategorizer.\_\_call\_\_ {#call tag="method"}
-
-Apply the pipe to one document. The document is modified in place, and returned.
-This usually happens under the hood when the `nlp` object is called on a text
-and all pipeline components are applied to the `Doc` in order. Both
-[`__call__`](/api/multilabel_textcategorizer#call) and [`pipe`](/api/multilabel_textcategorizer#pipe)
-delegate to the [`predict`](/api/multilabel_textcategorizer#predict) and
-[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> doc = nlp("This is a sentence.")
-> textcat = nlp.add_pipe("textcat_multilabel")
-> # This usually happens under the hood
-> processed = textcat(doc)
-> ```
-
-| Name        | Description                      |
-| ----------- | -------------------------------- |
-| `doc`       | The document to process. ~~Doc~~ |
-| **RETURNS** | The processed document. ~~Doc~~  |
-
-## MultiLabel_TextCategorizer.pipe {#pipe tag="method"}
-
-Apply the pipe to a stream of documents. This usually happens under the hood
-when the `nlp` object is called on a text and all pipeline components are
-applied to the `Doc` in order. Both [`__call__`](/api/multilabel_textcategorizer#call) and
-[`pipe`](/api/multilabel_textcategorizer#pipe) delegate to the
-[`predict`](/api/multilabel_textcategorizer#predict) and
-[`set_annotations`](/api/multilabel_textcategorizer#set_annotations) methods.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> for doc in textcat.pipe(docs, batch_size=50):
->     pass
-> ```
-
-| Name           | Description                                                   |
-| -------------- | ------------------------------------------------------------- |
-| `stream`       | A stream of documents. ~~Iterable[Doc]~~                      |
-| _keyword-only_ |                                                               |
-| `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
-| **YIELDS**     | The processed documents in order. ~~Doc~~                     |
-
-## MultiLabel_TextCategorizer.initialize {#initialize tag="method" new="3"}
-
-Initialize the component for training. `get_examples` should be a function that
-returns an iterable of [`Example`](/api/example) objects. The data examples are
-used to **initialize the model** of the component and can either be the full
-training data or a representative sample. Initialization includes validating the
-network,
-[inferring missing shapes](https://thinc.ai/docs/usage-models#validation) and
-setting up the label scheme based on the data. This method is typically called
-by [`Language.initialize`](/api/language#initialize) and lets you customize
-arguments it receives via the
-[`[initialize.components]`](/api/data-formats#config-initialize) block in the
-config.
-
-<Infobox variant="warning" title="Changed in v3.0" id="begin_training">
-
-This method was previously called `begin_training`.
-
-</Infobox>
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> textcat.initialize(lambda: [], nlp=nlp)
-> ```
->
-> ```ini
-> ### config.cfg
-> [initialize.components.textcat_multilabel]
->
-> [initialize.components.textcat_multilabel.labels]
-> @readers = "spacy.read_labels.v1"
-> path = "corpus/labels/textcat.json
-> ```
-
-| Name             | Description                                                                                                                                                                                                                                                                                                                                                                                                |
-| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `get_examples`   | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~                                                                                                                                                                                                                                                                      |
-| _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
-| `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
-| `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-
-## MultiLabel_TextCategorizer.predict {#predict tag="method"}
-
-Apply the component's model to a batch of [`Doc`](/api/doc) objects without
-modifying them.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict([doc1, doc2])
-> ```
-
-| Name        | Description                                 |
-| ----------- | ------------------------------------------- |
-| `docs`      | The documents to predict. ~~Iterable[Doc]~~ |
-| **RETURNS** | The model's prediction for each document.   |
-
-## MultiLabel_TextCategorizer.set_annotations {#set_annotations tag="method"}
-
-Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict(docs)
-> textcat.set_annotations(docs, scores)
-> ```
-
-| Name     | Description                                               |
-| -------- | --------------------------------------------------------- |
-| `docs`   | The documents to modify. ~~Iterable[Doc]~~                |
-| `scores` | The scores to set, produced by `MultiLabel_TextCategorizer.predict`. |
-
-## MultiLabel_TextCategorizer.update {#update tag="method"}
-
-Learn from a batch of [`Example`](/api/example) objects containing the
-predictions and gold-standard annotations, and update the component's model.
-Delegates to [`predict`](/api/multilabel_textcategorizer#predict) and
-[`get_loss`](/api/multilabel_textcategorizer#get_loss).
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> optimizer = nlp.initialize()
-> losses = textcat.update(examples, sgd=optimizer)
-> ```
-
-| Name              | Description                                                                                                                        |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    |
-| `drop`            | The dropout rate. ~~float~~                                                                                                        |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
-
-## MultiLabel_TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
-
-Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model to try to address
-the "catastrophic forgetting" problem. This feature is experimental.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> optimizer = nlp.resume_training()
-> losses = textcat.rehearse(examples, sgd=optimizer)
-> ```
-
-| Name           | Description                                                                                                              |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
-| _keyword-only_ |                                                                                                                          |
-| `drop`         | The dropout rate. ~~float~~                                                                                              |
-| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
-| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
-| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
-
-## MultiLabel_TextCategorizer.get_loss {#get_loss tag="method"}
-
-Find the loss and gradient of loss for the batch of documents and their
-predicted scores.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat_multilabel")
-> scores = textcat.predict([eg.predicted for eg in examples])
-> loss, d_loss = textcat.get_loss(examples, scores)
-> ```
-
-| Name        | Description                                                                 |
-| ----------- | --------------------------------------------------------------------------- |
-| `examples`  | The batch of examples. ~~Iterable[Example]~~                                |
-| `scores`    | Scores representing the model's predictions.                                |
-| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
-
-## MultiLabel_TextCategorizer.score {#score tag="method" new="3"}
-
-Score a batch of examples.
-
-> #### Example
->
-> ```python
-> scores = textcat.score(examples)
-> ```
-
-| Name             | Description                                                                                                          |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The examples to score. ~~Iterable[Example]~~                                                                         |
-| _keyword-only_   |                                                                                                                      |
-| **RETURNS**      | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
-
-## MultiLabel_TextCategorizer.create_optimizer {#create_optimizer tag="method"}
-
-Create an optimizer for the pipeline component.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> optimizer = textcat.create_optimizer()
-> ```
-
-| Name        | Description                  |
-| ----------- | ---------------------------- |
-| **RETURNS** | The optimizer. ~~Optimizer~~ |
-
-## MultiLabel_TextCategorizer.use_params {#use_params tag="method, contextmanager"}
-
-Modify the pipe's model to use the given parameter values.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> with textcat.use_params(optimizer.averages):
->     textcat.to_disk("/best_model")
-> ```
-
-| Name     | Description                                        |
-| -------- | -------------------------------------------------- |
-| `params` | The parameter values to use in the model. ~~dict~~ |
-
-## MultiLabel_TextCategorizer.add_label {#add_label tag="method"}
-
-Add a new label to the pipe. Raises an error if the output dimension is already
-set, or if the model has already been fully [initialized](#initialize). Note
-that you don't have to call this method if you provide a **representative data
-sample** to the [`initialize`](#initialize) method. In this case, all labels
-found in the sample will be automatically added to the model, and the output
-dimension will be [inferred](/usage/layers-architectures#thinc-shape-inference)
-automatically.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.add_label("MY_LABEL")
-> ```
-
-| Name        | Description                                                 |
-| ----------- | ----------------------------------------------------------- |
-| `label`     | The label to add. ~~str~~                                   |
-| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
-
-## MultiLabel_TextCategorizer.to_disk {#to_disk tag="method"}
-
-Serialize the pipe to disk.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.to_disk("/path/to/textcat")
-> ```
-
-| Name           | Description                                                                                                                                |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
-| `path`         | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| _keyword-only_ |                                                                                                                                            |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~                                                |
-
-## MultiLabel_TextCategorizer.from_disk {#from_disk tag="method"}
-
-Load the pipe from disk. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat.from_disk("/path/to/textcat")
-> ```
-
-| Name           | Description                                                                                     |
-| -------------- | ----------------------------------------------------------------------------------------------- |
-| `path`         | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
-| _keyword-only_ |                                                                                                 |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~     |
-| **RETURNS**    | The modified `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                      |
-
-## MultiLabel_TextCategorizer.to_bytes {#to_bytes tag="method"}
-
-> #### Example
->
-> ```python
-> textcat = nlp.add_pipe("textcat")
-> textcat_bytes = textcat.to_bytes()
-> ```
-
-Serialize the pipe to a bytestring.
-
-| Name           | Description                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------- |
-| _keyword-only_ |                                                                                             |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The serialized form of the `MultiLabel_TextCategorizer` object. ~~bytes~~                              |
-
-## MultiLabel_TextCategorizer.from_bytes {#from_bytes tag="method"}
-
-Load the pipe from a bytestring. Modifies the object in place and returns it.
-
-> #### Example
->
-> ```python
-> textcat_bytes = textcat.to_bytes()
-> textcat = nlp.add_pipe("textcat")
-> textcat.from_bytes(textcat_bytes)
-> ```
-
-| Name           | Description                                                                                 |
-| -------------- | ------------------------------------------------------------------------------------------- |
-| `bytes_data`   | The data to load from. ~~bytes~~                                                            |
-| _keyword-only_ |                                                                                             |
-| `exclude`      | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
-| **RETURNS**    | The `MultiLabel_TextCategorizer` object. ~~MultiLabel_TextCategorizer~~                                           |
-
-## MultiLabel_TextCategorizer.labels {#labels tag="property"}
-
-The labels currently added to the component.
-
-> #### Example
->
-> ```python
-> textcat.add_label("MY_LABEL")
-> assert "MY_LABEL" in textcat.labels
-> ```
-
-| Name        | Description                                            |
-| ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
-
-## MultiLabel_TextCategorizer.label_data {#label_data tag="property" new="3"}
-
-The labels currently added to the component and their internal meta information.
-This is the data generated by [`init labels`](/api/cli#init-labels) and used by
-[`MultiLabel_TextCategorizer.initialize`](/api/multilabel_textcategorizer#initialize) to initialize
-the model with a pre-defined label set.
-
-> #### Example
->
-> ```python
-> labels = textcat.label_data
-> textcat.initialize(lambda: [], nlp=nlp, labels=labels)
-> ```
-
-| Name        | Description                                                |
-| ----------- | ---------------------------------------------------------- |
-| **RETURNS** | The label data added to the component. ~~Tuple[str, ...]~~ |
-
-## Serialization fields {#serialization-fields}
-
-During serialization, spaCy will export several data fields used to restore
-different aspects of the object. If needed, you can exclude them from
-serialization by passing in the string names via the `exclude` argument.
-
-> #### Example
->
-> ```python
-> data = textcat.to_disk("/path", exclude=["vocab"])
-> ```
-
-| Name    | Description                                                    |
-| ------- | -------------------------------------------------------------- |
-| `vocab` | The shared [`Vocab`](/api/vocab).                              |
-| `cfg`   | The config file. You usually don't want to exclude this.       |
-| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index ac0ab4f27..fdd235b85 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -3,15 +3,30 @@ title: TextCategorizer
 tag: class
 source: spacy/pipeline/textcat.py
 new: 2
-teaser: 'Pipeline component for single-label text classification'
+teaser: 'Pipeline component for text classification'
 api_base_class: /api/pipe
 api_string_name: textcat
 api_trainable: true
 ---
 
-The text categorizer predicts **categories over a whole document**. It can learn
-one or more labels, and the labels are mutually exclusive - there is exactly one 
-true label per document. 
+The text categorizer predicts **categories over a whole document** and comes in
+two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+exactly one true label per document, use the `textcat` component, which has
+mutually exclusive labels. If you want to perform multi-label classification and
+predict zero, one or more labels per document, use the `textcat_multilabel`
+component instead.
+
+Both components are documented on this page.
+
+<Infobox title="Migration from v2" variant="warning">
+
+In spaCy v2, the `textcat` component could also perform **multi-label
+classification**, and even used this setting by default. Since v3.0, the
+component `textcat_multilabel` should be used for multi-label classification
+instead. The `textcat` component is now used for mutually exclusive classes
+only.
+
+</Infobox>
 
 ## Config and implementation {#config}
 
@@ -22,7 +37,7 @@ how the component should be configured. You can override its settings via the
 [model architectures](/api/architectures) documentation for details on the
 architectures and their arguments and hyperparameters.
 
-> #### Example
+> #### Example (textcat)
 >
 > ```python
 > from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL
@@ -33,6 +48,17 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("textcat", config=config)
 > ```
 
+> #### Example (textcat_multilabel)
+>
+> ```python
+> from spacy.pipeline.textcat_multilabel import DEFAULT_MULTI_TEXTCAT_MODEL
+> config = {
+>    "threshold": 0.5,
+>    "model": DEFAULT_MULTI_TEXTCAT_MODEL,
+> }
+> nlp.add_pipe("textcat_multilabel", config=config)
+> ```
+
 | Setting     | Description                                                                                                                                                      |
 | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `threshold` | Cutoff to consider a prediction "positive", relevant when printing accuracy results. ~~float~~                                                                   |
@@ -48,6 +74,7 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > # Construction via add_pipe with default model
+> # Use 'textcat_multilabel' for multi-label classification
 > textcat = nlp.add_pipe("textcat")
 >
 > # Construction via add_pipe with custom model
@@ -55,6 +82,7 @@ architectures and their arguments and hyperparameters.
 > parser = nlp.add_pipe("textcat", config=config)
 >
 > # Construction from class
+> # Use 'MultiLabel_TextCategorizer' for multi-label classification
 > from spacy.pipeline import TextCategorizer
 > textcat = TextCategorizer(nlp.vocab, model, threshold=0.5)
 > ```
@@ -161,7 +189,7 @@ This method was previously called `begin_training`.
 | _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
 | `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
 | `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, None otherwise and by default. ~~Optional[str]~~                                                                                                                                                                                                                                                                                              |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                                                             |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
@@ -212,14 +240,14 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 > losses = textcat.update(examples, sgd=optimizer)
 > ```
 
-| Name              | Description                                                                                                                        |
-| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                  |
-| _keyword-only_    |                                                                                                                                    |
-| `drop`            | The dropout rate. ~~float~~                                                                                                        |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                      |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~           |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                              |
+| Name           | Description                                                                                                              |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| `examples`     | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                        |
+| _keyword-only_ |                                                                                                                          |
+| `drop`         | The dropout rate. ~~float~~                                                                                              |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~            |
+| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                    |
 
 ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
@@ -273,11 +301,11 @@ Score a batch of examples.
 > scores = textcat.score(examples)
 > ```
 
-| Name             | Description                                                                                                          |
-| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
-| `examples`       | The examples to score. ~~Iterable[Example]~~                                                                         |
-| _keyword-only_   |                                                                                                                      |
-| **RETURNS**      | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
+| Name           | Description                                                                                                          |
+| -------------- | -------------------------------------------------------------------------------------------------------------------- |
+| `examples`     | The examples to score. ~~Iterable[Example]~~                                                                         |
+| _keyword-only_ |                                                                                                                      |
+| **RETURNS**    | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 0058d40dc..909a9c7de 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -223,21 +223,22 @@ available pipeline components and component functions.
 > ruler = nlp.add_pipe("entity_ruler")
 > ```
 
-| String name       | Component                                       | Description                                                                               |
-| ----------------- | ----------------------------------------------- | ----------------------------------------------------------------------------------------- |
-| `tagger`          | [`Tagger`](/api/tagger)                         | Assign part-of-speech-tags.                                                               |
-| `parser`          | [`DependencyParser`](/api/dependencyparser)     | Assign dependency labels.                                                                 |
-| `ner`             | [`EntityRecognizer`](/api/entityrecognizer)     | Assign named entities.                                                                    |
-| `entity_linker`   | [`EntityLinker`](/api/entitylinker)             | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
-| `entity_ruler`    | [`EntityRuler`](/api/entityruler)               | Assign named entities based on pattern rules and dictionaries.                            |
-| `textcat`         | [`TextCategorizer`](/api/textcategorizer)       | Assign text categories.                                                                   |
-| `lemmatizer`      | [`Lemmatizer`](/api/lemmatizer)                 | Assign base forms to words.                                                               |
-| `morphologizer`   | [`Morphologizer`](/api/morphologizer)           | Assign morphological features and coarse-grained POS tags.                                |
-| `attribute_ruler` | [`AttributeRuler`](/api/attributeruler)         | Assign token attribute mappings and rule-based exceptions.                                |
-| `senter`          | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries.                                                               |
-| `sentencizer`     | [`Sentencizer`](/api/sentencizer)               | Add rule-based sentence segmentation without the dependency parse.                        |
-| `tok2vec`         | [`Tok2Vec`](/api/tok2vec)                       | Assign token-to-vector embeddings.                                                        |
-| `transformer`     | [`Transformer`](/api/transformer)               | Assign the tokens and outputs of a transformer model.                                     |
+| String name          | Component                                            | Description                                                                               |
+| -------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------- |
+| `tagger`             | [`Tagger`](/api/tagger)                              | Assign part-of-speech tags.                                                               |
+| `parser`             | [`DependencyParser`](/api/dependencyparser)          | Assign dependency labels.                                                                 |
+| `ner`                | [`EntityRecognizer`](/api/entityrecognizer)          | Assign named entities.                                                                    |
+| `entity_linker`      | [`EntityLinker`](/api/entitylinker)                  | Assign knowledge base IDs to named entities. Should be added after the entity recognizer. |
+| `entity_ruler`       | [`EntityRuler`](/api/entityruler)                    | Assign named entities based on pattern rules and dictionaries.                            |
+| `textcat`            | [`TextCategorizer`](/api/textcategorizer)            | Assign text categories: exactly one category is predicted per document.                   |
+| `textcat_multilabel` | [`MultiLabel_TextCategorizer`](/api/textcategorizer) | Assign text categories in a multi-label setting: zero, one or more labels per document.   |
+| `lemmatizer`         | [`Lemmatizer`](/api/lemmatizer)                      | Assign base forms to words.                                                               |
+| `morphologizer`      | [`Morphologizer`](/api/morphologizer)                | Assign morphological features and coarse-grained POS tags.                                |
+| `attribute_ruler`    | [`AttributeRuler`](/api/attributeruler)              | Assign token attribute mappings and rule-based exceptions.                                |
+| `senter`             | [`SentenceRecognizer`](/api/sentencerecognizer)      | Assign sentence boundaries.                                                               |
+| `sentencizer`        | [`Sentencizer`](/api/sentencizer)                    | Add rule-based sentence segmentation without the dependency parse.                        |
+| `tok2vec`            | [`Tok2Vec`](/api/tok2vec)                            | Assign token-to-vector embeddings.                                                        |
+| `transformer`        | [`Transformer`](/api/transformer)                    | Assign the tokens and outputs of a transformer model.                                     |
 
 ### Disabling, excluding and modifying components {#disabling}
 
@@ -400,8 +401,8 @@ vectors available – otherwise, it won't be able to make the same predictions.
 > ```
 >
 > By default, sourced components will be updated with your data during training.
-> If you want to preserve the component as-is, you can "freeze" it if the pipeline 
-> is not using a shared `Tok2Vec` layer:
+> If you want to preserve the component as-is, you can "freeze" it if the
+> pipeline is not using a shared `Tok2Vec` layer:
 >
 > ```ini
 > [training]
@@ -1244,7 +1245,7 @@ labels = []
 # the argument "model"
 [components.textcat.model]
 @architectures = "spacy.TextCatBOW.v1"
-exclusive_classes = false
+exclusive_classes = true
 ngram_size = 1
 no_output_layer = false
 
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 5353f9ded..21e99ffc2 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -320,14 +320,15 @@ add to your pipeline and customize for your use case:
 > nlp.add_pipe("lemmatizer")
 > ```
 
-| Name                                            | Description                                                                                                                                                                                                             |
-| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| [`SentenceRecognizer`](/api/sentencerecognizer) | Trainable component for sentence segmentation.                                                                                                                                                                          |
-| [`Morphologizer`](/api/morphologizer)           | Trainable component to predict morphological features.                                                                                                                                                                  |
-| [`Lemmatizer`](/api/lemmatizer)                 | Standalone component for rule-based and lookup lemmatization.                                                                                                                                                           |
-| [`AttributeRuler`](/api/attributeruler)         | Component for setting token attributes using match patterns.                                                                                                                                                            |
-| [`Transformer`](/api/transformer)               | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
-| [`TrainablePipe`](/api/pipe)                    | Base class for trainable pipeline components.                                                                                                                                                                           |
+| Name                                                  | Description                                                                                                                                                                                                             |
+| ----------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [`SentenceRecognizer`](/api/sentencerecognizer)       | Trainable component for sentence segmentation.                                                                                                                                                                          |
+| [`Morphologizer`](/api/morphologizer)                 | Trainable component to predict morphological features.                                                                                                                                                                  |
+| [`Lemmatizer`](/api/lemmatizer)                       | Standalone component for rule-based and lookup lemmatization.                                                                                                                                                           |
+| [`AttributeRuler`](/api/attributeruler)               | Component for setting token attributes using match patterns.                                                                                                                                                            |
+| [`Transformer`](/api/transformer)                     | Component for using [transformer models](/usage/embeddings-transformers) in your pipeline, accessing outputs and aligning tokens. Provided via [`spacy-transformers`](https://github.com/explosion/spacy-transformers). |
+| [`TrainablePipe`](/api/pipe)                          | Base class for trainable pipeline components.                                                                                                                                                                           |
+| [`Multi-label TextCategorizer`](/api/textcategorizer) | Trainable component for multi-label text classification.                                                                                                                                                                |
 
 <Infobox title="Details & Documentation" emoji="📖" list>
 
@@ -592,6 +593,10 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 - Various keyword arguments across functions and methods are now explicitly
   declared as **keyword-only** arguments. Those arguments are documented
   accordingly across the API reference using the <Tag>keyword-only</Tag> tag.
+- The `textcat` pipeline component is now only applicable for classification of
+  mutually exclusive classes - i.e. one predicted class per input sentence or
+  document. To perform multi-label classification, use the new
+  `textcat_multilabel` component instead.
 
 ### Removed or renamed API {#incompat-removed}
 
diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js
index 3d2ab0930..849c80f3d 100644
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@@ -9,6 +9,7 @@ import { htmlToReact } from '../components/util'
 const DEFAULT_LANG = 'en'
 const DEFAULT_HARDWARE = 'cpu'
 const DEFAULT_OPT = 'efficiency'
+const DEFAULT_TEXTCAT_EXCLUSIVE = true
 const COMPONENTS = ['tagger', 'parser', 'ner', 'textcat']
 const COMMENT = `# This is an auto-generated partial config. To use it with 'spacy train'
 # you can run spacy init fill-config to auto-fill all default settings:
@@ -27,6 +28,19 @@ const DATA = [
         options: COMPONENTS.map(id => ({ id, title: id })),
         multiple: true,
     },
+    {
+        id: 'textcat',
+        title: 'Text Classification',
+        multiple: true,
+        options: [
+            {
+                id: 'exclusive',
+                title: 'exclusive categories',
+                checked: DEFAULT_TEXTCAT_EXCLUSIVE,
+                help: 'only one label can apply',
+            },
+        ],
+    },
     {
         id: 'hardware',
         title: 'Hardware',
@@ -49,14 +63,28 @@ const DATA = [
 
 export default function QuickstartTraining({ id, title, download = 'base_config.cfg' }) {
     const [lang, setLang] = useState(DEFAULT_LANG)
+    const [_components, _setComponents] = useState([])
     const [components, setComponents] = useState([])
     const [[hardware], setHardware] = useState([DEFAULT_HARDWARE])
     const [[optimize], setOptimize] = useState([DEFAULT_OPT])
+    const [textcatExclusive, setTextcatExclusive] = useState(DEFAULT_TEXTCAT_EXCLUSIVE)
+
+    function updateComponents(value, isExclusive) {
+        _setComponents(value)
+        const updated = value.map(c => (c === 'textcat' && !isExclusive ? 'textcat_multilabel' : c))
+        setComponents(updated)
+    }
+
     const setters = {
         lang: setLang,
-        components: setComponents,
+        components: v => updateComponents(v, textcatExclusive),
         hardware: setHardware,
         optimize: setOptimize,
+        textcat: v => {
+            const isExclusive = v.includes('exclusive')
+            setTextcatExclusive(isExclusive)
+            updateComponents(_components, isExclusive)
+        },
     }
     const reco = GENERATOR_DATA[lang] || GENERATOR_DATA.__default__
     const content = generator({
@@ -78,20 +106,24 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
         <StaticQuery
             query={query}
             render={({ site }) => {
+                let data = DATA
                 const langs = site.siteMetadata.languages
-                DATA[0].dropdown = langs
+                data[0].dropdown = langs
                     .map(({ name, code }) => ({
                         id: code,
                         title: name,
                     }))
                     .sort((a, b) => a.title.localeCompare(b.title))
+                if (!_components.includes('textcat')) {
+                    data = data.filter(({ id }) => id !== 'textcat')
+                }
                 return (
                     <Quickstart
                         id="quickstart-widget"
                         Container="div"
                         download={download}
                         rawContent={rawContent}
-                        data={DATA}
+                        data={data}
                         title={title}
                         id={id}
                         setters={setters}

From 4f32e3dedbdc556ed144646435ded4ad32fca52d Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 10 Mar 2021 01:08:05 +1100
Subject: [PATCH 021/146] Update issue templates [ci skip]

---
 .../ISSUE_TEMPLATE/{03_docs.md => 02_docs.md} |  0
 .github/ISSUE_TEMPLATE/02_install.md          | 21 -------------------
 .../{04_other.md => 03_other.md}              |  0
 3 files changed, 21 deletions(-)
 rename .github/ISSUE_TEMPLATE/{03_docs.md => 02_docs.md} (100%)
 delete mode 100644 .github/ISSUE_TEMPLATE/02_install.md
 rename .github/ISSUE_TEMPLATE/{04_other.md => 03_other.md} (100%)

diff --git a/.github/ISSUE_TEMPLATE/03_docs.md b/.github/ISSUE_TEMPLATE/02_docs.md
similarity index 100%
rename from .github/ISSUE_TEMPLATE/03_docs.md
rename to .github/ISSUE_TEMPLATE/02_docs.md
diff --git a/.github/ISSUE_TEMPLATE/02_install.md b/.github/ISSUE_TEMPLATE/02_install.md
deleted file mode 100644
index d0790bbdb..000000000
--- a/.github/ISSUE_TEMPLATE/02_install.md
+++ /dev/null
@@ -1,21 +0,0 @@
----
-name: "\U000023F3 Installation Problem"
-about: Do you have problems installing spaCy, and none of the suggestions in the docs
-  and other issues helped?
-
----
-<!-- Before submitting an issue, make sure to check the docs and closed issues to see if any of the solutions work for you. Installation problems can often be related to Python environment issues and problems with compilation. -->
-
-## How to reproduce the problem
-<!-- Include the details of how the problem occurred. Which command did you run to install spaCy? Did you come across an error? What else did you try? -->
-
-```bash
-# copy-paste the error message here
-```
-
-## Your Environment
-<!-- Include details of your environment. If you're using spaCy 1.7+, you can also type `python -m spacy info --markdown` and copy-paste the result here.-->
-* Operating System:
-* Python Version Used:
-* spaCy Version Used:
-* Environment Information:
diff --git a/.github/ISSUE_TEMPLATE/04_other.md b/.github/ISSUE_TEMPLATE/03_other.md
similarity index 100%
rename from .github/ISSUE_TEMPLATE/04_other.md
rename to .github/ISSUE_TEMPLATE/03_other.md

From d746ea6278b3419986c1e6a8359b236a47ab7abc Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 9 Mar 2021 15:35:21 +0100
Subject: [PATCH 022/146] Add warning about GPU selection in Jupyter notebooks
 (#7075)

* Initial warning

* Update check

* Redo edit

* Move jupyter warning to helper method

* Add link with details to warnings
---
 spacy/errors.py               |  5 +++++
 spacy/language.py             |  5 +++++
 spacy/util.py                 | 12 ++++++++++++
 website/docs/api/top-level.md | 24 ++++++++++++++++++++++++
 website/docs/usage/v3.md      | 12 ++++++++++++
 5 files changed, 58 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index e50a658d8..4f9e90b57 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -147,6 +147,11 @@ class Warnings:
             "will be included in the results. For better results, token "
             "patterns should return matches that are each exactly one token "
             "long.")
+    W111 = ("Jupyter notebook detected: if using `prefer_gpu()` or "
+            "`require_gpu()`, include it in the same cell right before "
+            "`spacy.load()` to ensure that the model is loaded on the correct "
+            "device. More information: "
+            "http://spacy.io/usage/v3#jupyter-notebook-gpu")
 
 
 @add_codes
diff --git a/spacy/language.py b/spacy/language.py
index 5741ef97c..871dfafaa 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -22,6 +22,7 @@ from .training.initialize import init_vocab, init_tok2vec
 from .scorer import Scorer
 from .util import registry, SimpleFrozenList, _pipe, raise_error
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
+from .util import warn_if_jupyter_cupy
 from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
@@ -1622,6 +1623,10 @@ class Language:
                 or lang_cls is not cls
             ):
                 raise ValueError(Errors.E943.format(value=type(lang_cls)))
+
+        # Warn about require_gpu usage in jupyter notebook
+        warn_if_jupyter_cupy()
+
         # Note that we don't load vectors here, instead they get loaded explicitly
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
diff --git a/spacy/util.py b/spacy/util.py
index bcb51fe7d..4b82eea8d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1500,3 +1500,15 @@ def raise_error(proc_name, proc, docs, e):
 
 def ignore_error(proc_name, proc, docs, e):
     pass
+
+
+def warn_if_jupyter_cupy():
+    """Warn about require_gpu if a jupyter notebook + cupy + mismatched
+    contextvars vs. thread ops are detected
+    """
+    if is_in_jupyter():
+        from thinc.backends.cupy_ops import CupyOps
+        if CupyOps.xp is not None:
+            from thinc.backends import contextvars_eq_thread_ops
+            if not contextvars_eq_thread_ops():
+                warnings.warn(Warnings.W111)
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 37f619f3e..e1d81a5b5 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -138,6 +138,14 @@ data has already been allocated on CPU, it will not be moved. Ideally, this
 function should be called right after importing spaCy and _before_ loading any
 pipelines.
 
+<Infobox variant="warning" title="Jupyter notebook usage">
+
+In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()`
+to ensure that the model is loaded on the correct device. See [more
+details](/usage/v3#jupyter-notebook-gpu).
+
+</Infobox>
+
 > #### Example
 >
 > ```python
@@ -158,6 +166,14 @@ if no GPU is available. If data has already been allocated on CPU, it will not
 be moved. Ideally, this function should be called right after importing spaCy
 and _before_ loading any pipelines.
 
+<Infobox variant="warning" title="Jupyter notebook usage">
+
+In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()`
+to ensure that the model is loaded on the correct device. See [more
+details](/usage/v3#jupyter-notebook-gpu).
+
+</Infobox>
+
 > #### Example
 >
 > ```python
@@ -177,6 +193,14 @@ Allocate data and perform operations on CPU. If data has already been allocated
 on GPU, it will not be moved. Ideally, this function should be called right
 after importing spaCy and _before_ loading any pipelines.
 
+<Infobox variant="warning" title="Jupyter notebook usage">
+
+In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()`
+to ensure that the model is loaded on the correct device. See [more
+details](/usage/v3#jupyter-notebook-gpu).
+
+</Infobox>
+
 > #### Example
 >
 > ```python
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 21e99ffc2..847d4a327 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -1179,3 +1179,15 @@ This means that spaCy knows how to initialize `my_component`, even if your
 package isn't imported.
 
 </Infobox>
+
+#### Using GPUs in Jupyter notebooks {#jupyter-notebook-gpu}
+
+In Jupyter notebooks, run [`prefer_gpu`](/api/top-level#spacy.prefer_gpu),
+[`require_gpu`](/api/top-level#spacy.require_gpu) or
+[`require_cpu`](/api/top-level#spacy.require_cpu) in the same cell as
+[`spacy.load`](/api/top-level#spacy.load) to ensure that the model is loaded on the correct device.
+
+Due to a bug related to `contextvars` (see the [bug
+report](https://github.com/ipython/ipython/issues/11565)), the GPU settings may
+not be preserved correctly across cells, resulting in models being loaded on
+the wrong device or only partially on GPU.

From 3b911ee5ef2240919b66a0ce55a5d387ceb6f904 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 9 Mar 2021 16:49:41 +0100
Subject: [PATCH 023/146] Set version to v3.0.4 (#7376)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index c19e1aeaa..4cbfdbad3 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.3"
+__version__ = "3.0.4"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 53a3b967ac704ff0a67a7102ede6d916e2a4545a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 10 Mar 2021 11:10:53 +0100
Subject: [PATCH 024/146] Update thinc pin and set version to v3.0.5 (#7389)

---
 pyproject.toml   | 2 +-
 requirements.txt | 2 +-
 setup.cfg        | 4 ++--
 spacy/about.py   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3113cf6c5..f00fdc9f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0,<8.1.0",
+    "thinc>=8.0.2,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
diff --git a/requirements.txt b/requirements.txt
index 01a3be120..e09a5b221 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 spacy-legacy>=3.0.0,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0,<8.1.0
+thinc>=8.0.2,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 482c1fbdd..09f989c54 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,14 +34,14 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0,<8.1.0
+    thinc>=8.0.2,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.0,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0,<8.1.0
+    thinc>=8.0.2,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index 4cbfdbad3..2987f3c53 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.4"
+__version__ = "3.0.5"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From 28726c25a19248b06b59c5ca759410b84b70668c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 10 Mar 2021 11:42:02 +0100
Subject: [PATCH 025/146] Update docs for convert CLI and NER examples

---
 extra/example_data/ner_example_data/README.md | 20 ++++++++++++++++-
 website/docs/api/cli.md                       | 22 +++++++++----------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/extra/example_data/ner_example_data/README.md b/extra/example_data/ner_example_data/README.md
index af70694f5..3c6a4a86b 100644
--- a/extra/example_data/ner_example_data/README.md
+++ b/extra/example_data/ner_example_data/README.md
@@ -1,7 +1,25 @@
 ## Examples of NER/IOB data that can be converted with `spacy convert`
 
-spacy JSON training files were generated with:
+To convert an IOB file to `.spacy` ([`DocBin`](https://spacy.io/api/docbin))
+for spaCy v3:
 
+```bash
+python -m spacy convert -c iob -s -n 10 -b en_core_web_sm file.iob .
 ```
+
+See all the `spacy convert` options: https://spacy.io/api/cli#convert
+
+---
+
+The spaCy v2 JSON training files were generated using **spaCy v2** with:
+
+```bash
 python -m spacy convert -c iob -s -n 10 -b en file.iob
 ```
+
+To convert an existing JSON training file to `.spacy` for spaCy v3, convert
+with **spaCy v3**:
+
+```bash
+python -m spacy convert file.json .
+```
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e8be0f79c..fd149b285 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -261,24 +261,24 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 | `output_dir`                                     | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~        |
 | `--converter`, `-c` <Tag variant="new">2</Tag>   | Name of converter to use (see below). ~~str (option)~~                                                                                    |
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
-| `--n-sents`, `-n`                                | Number of sentences per document. ~~int (option)~~                                                                                        |
-| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences (for `--converter ner`). ~~bool (flag)~~                                                                                |
+| `--n-sents`, `-n`                                | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~                                         |
+| `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~                                                                          |
 | `--base`, `-b`                                   | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~                            |
-| `--morphology`, `-m`                             | Enable appending morphology to tags. ~~bool (flag)~~                                                                                      |
-| `--ner-map`, `-nm`                               | NER tag mapping (as JSON-encoded dict of entity types). ~~Optional[Path](option)~~                                                        |
+| `--morphology`, `-m`                             | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~                                                              |
+| `--ner-map`, `-nm`                               | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~                                |
 | `--lang`, `-l` <Tag variant="new">2.1</Tag>      | Language code (if tokenizer required). ~~Optional[str] \(option)~~                                                                        |
 | `--help`, `-h`                                   | Show help message and available arguments. ~~bool (flag)~~                                                                                |
 | **CREATES**                                      | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train).                                       |
 
 ### Converters {#converters}
 
-| ID      | Description                                                                                                                                                                                                                                                                                                                                                     |
-| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `auto`  | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                |
-| `json`  | JSON-formatted training data used in spaCy v2.x.                                                                                                                                                                                                                                                                                                                |
-| `conll` | Universal Dependencies `.conllu` or `.conll` format.                                                                                                                                                                                                                                                                                                            |
-| `ner`   | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
-| `iob`   | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data).                                                                                                                            |
+| ID              | Description                                                                                                                                                                                                                                                                                                                                                     |
+| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `auto`          | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                |
+| `json`          | JSON-formatted training data used in spaCy v2.x.                                                                                                                                                                                                                                                                                                                |
+| `conllu`        | Universal Dependencies `.conllu` format.                                                                                                                                                                                                                                                                                                                        |
+| `ner` / `conll` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
+| `iob`           | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data).                                                                                                                            |
 
 ## debug {#debug new="3"}
 

From fbf3a755d7af0afc32fb7f7d83d4b9933ed724e4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 09:36:58 +0100
Subject: [PATCH 026/146] Make spacy.load kwargs keyword-only

---
 spacy/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 36074c440..cd5a40406 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -28,6 +28,7 @@ if sys.maxunicode == 65535:
 
 def load(
     name: Union[str, Path],
+    *,
     disable: Iterable[str] = util.SimpleFrozenList(),
     exclude: Iterable[str] = util.SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),

From 4294bcf4ab6ab1a45ff05adf05ca369fa02bdc81 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 09:52:40 +0100
Subject: [PATCH 027/146] Align keyword-only in docs for init/util

---
 website/docs/api/top-level.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index e1d81a5b5..cf9a58941 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -140,9 +140,9 @@ pipelines.
 
 <Infobox variant="warning" title="Jupyter notebook usage">
 
-In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()`
-to ensure that the model is loaded on the correct device. See [more
-details](/usage/v3#jupyter-notebook-gpu).
+In a Jupyter notebook, run `prefer_gpu()` in the same cell as `spacy.load()` to
+ensure that the model is loaded on the correct device. See
+[more details](/usage/v3#jupyter-notebook-gpu).
 
 </Infobox>
 
@@ -168,9 +168,9 @@ and _before_ loading any pipelines.
 
 <Infobox variant="warning" title="Jupyter notebook usage">
 
-In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()`
-to ensure that the model is loaded on the correct device. See [more
-details](/usage/v3#jupyter-notebook-gpu).
+In a Jupyter notebook, run `require_gpu()` in the same cell as `spacy.load()` to
+ensure that the model is loaded on the correct device. See
+[more details](/usage/v3#jupyter-notebook-gpu).
 
 </Infobox>
 
@@ -195,9 +195,9 @@ after importing spaCy and _before_ loading any pipelines.
 
 <Infobox variant="warning" title="Jupyter notebook usage">
 
-In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()`
-to ensure that the model is loaded on the correct device. See [more
-details](/usage/v3#jupyter-notebook-gpu).
+In a Jupyter notebook, run `require_cpu()` in the same cell as `spacy.load()` to
+ensure that the model is loaded on the correct device. See
+[more details](/usage/v3#jupyter-notebook-gpu).
 
 </Infobox>
 
@@ -945,7 +945,8 @@ and create a `Language` object. The model data will then be loaded in via
 | Name                                 | Description                                                                                                                                                                                                                                      |
 | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `name`                               | Package name or path. ~~str~~                                                                                                                                                                                                                    |
-| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                                                                                                           |
+| _keyword-only_                       |                                                                                                                                                                                                                                                  |
+| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                            |
 | `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                             |
 | `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~                                                                                                   |
@@ -968,6 +969,7 @@ A helper function to use in the `load()` method of a pipeline package's
 | Name                                 | Description                                                                                                                                                                                                                                    |
 | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `init_file`                          | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~                                                                                                                                                                         |
+| _keyword-only_                       |                                                                                                                                                                                                                                                |
 | `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                                                                                                         |
 | `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                           |

From 84470d9b9e65bd1843dd250e5d94bb44fd87469e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 10:10:58 +0100
Subject: [PATCH 028/146] Incorporate BILUO note from #7407

---
 website/docs/api/cli.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index fd149b285..44a8e2fc2 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -272,13 +272,13 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 
 ### Converters {#converters}
 
-| ID              | Description                                                                                                                                                                                                                                                                                                                                                     |
-| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `auto`          | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                |
-| `json`          | JSON-formatted training data used in spaCy v2.x.                                                                                                                                                                                                                                                                                                                |
-| `conllu`        | Universal Dependencies `.conllu` format.                                                                                                                                                                                                                                                                                                                        |
-| `ner` / `conll` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
-| `iob`           | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT`or`word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data).                                                                                                                            |
+| ID              | Description                                                                                                                                                                                                                                                                                                                                                           |
+| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `auto`          | Automatically pick converter based on file extension and file content (default).                                                                                                                                                                                                                                                                                      |
+| `json`          | JSON-formatted training data used in spaCy v2.x.                                                                                                                                                                                                                                                                                                                      |
+| `conllu`        | Universal Dependencies `.conllu` format.                                                                                                                                                                                                                                                                                                                              |
+| `ner` / `conll` | NER with IOB/IOB2/BILUO tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the NER tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data). |
+| `iob`           | NER with IOB/IOB2/BILUO tags, one sentence per line with tokens separated by whitespace and annotation separated by `\|`, either `word\|B-ENT` or `word\|POS\|B-ENT`. See [sample data](%%GITHUB_SPACY/extra/example_data/ner_example_data).                                                                                                                          |
 
 ## debug {#debug new="3"}
 

From 124304b14672cb3d82c495b0fd45f60ecca90ea8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 10:58:59 +0100
Subject: [PATCH 029/146] Add vocab kwarg back to spacy.load

* Additional minor formatting and docs cleanup
---
 spacy/__init__.py             |  8 ++++++--
 website/docs/api/top-level.md | 19 ++++++++++---------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/spacy/__init__.py b/spacy/__init__.py
index cd5a40406..1eef7e621 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -29,6 +29,7 @@ if sys.maxunicode == 65535:
 def load(
     name: Union[str, Path],
     *,
+    vocab: Union[Vocab, bool] = True,
     disable: Iterable[str] = util.SimpleFrozenList(),
     exclude: Iterable[str] = util.SimpleFrozenList(),
     config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
@@ -36,6 +37,7 @@ def load(
     """Load a spaCy model from an installed package or a local path.
 
     name (str): Package name or model path.
+    vocab (Union[Vocab, bool]): A Vocab object. If True, a vocab is created.
     disable (Iterable[str]): Names of pipeline components to disable. Disabled
         pipes will be loaded but they won't be run unless you explicitly
         enable them by calling nlp.enable_pipe.
@@ -45,7 +47,9 @@ def load(
         keyed by section values in dot notation.
     RETURNS (Language): The loaded nlp object.
     """
-    return util.load_model(name, disable=disable, exclude=exclude, config=config)
+    return util.load_model(
+        name, vocab=vocab, disable=disable, exclude=exclude, config=config
+    )
 
 
 def blank(
@@ -53,7 +57,7 @@ def blank(
     *,
     vocab: Union[Vocab, bool] = True,
     config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
-    meta: Dict[str, Any] = util.SimpleFrozenDict()
+    meta: Dict[str, Any] = util.SimpleFrozenDict(),
 ) -> Language:
     """Create a blank nlp object for a given language code.
 
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index cf9a58941..eef8958cf 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -48,6 +48,7 @@ specified separately using the new `exclude` keyword argument.
 | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `name`                               | Pipeline to load, i.e. package name or path. ~~Union[str, Path]~~                                                                                                                                                                              |
 | _keyword-only_                       |                                                                                                                                                                                                                                                |
+| `vocab`                              | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                          |
 | `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                           |
 | `config` <Tag variant="new">3</Tag>  | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~                                                                             |
@@ -83,9 +84,9 @@ Create a blank pipeline of a given language class. This function is the twin of
 | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~                                                           |
 | _keyword-only_                      |                                                                                                                                                                    |
-| `vocab` <Tag variant="new">3</Tag>  | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                             |
+| `vocab`                             | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              |
 | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
-| `meta` <Tag variant="new">3</Tag>   | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~                                                                                   |
+| `meta`                              | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~                                                                                   |
 | **RETURNS**                         | An empty `Language` object of the appropriate subclass. ~~Language~~                                                                                               |
 
 ### spacy.info {#spacy.info tag="function"}
@@ -946,7 +947,7 @@ and create a `Language` object. The model data will then be loaded in via
 | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `name`                               | Package name or path. ~~str~~                                                                                                                                                                                                                    |
 | _keyword-only_                       |                                                                                                                                                                                                                                                  |
-| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                            |
+| `vocab`                              | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                            |
 | `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                             |
 | `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~                                                                                                   |
@@ -970,7 +971,7 @@ A helper function to use in the `load()` method of a pipeline package's
 | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `init_file`                          | Path to package's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~                                                                                                                                                                         |
 | _keyword-only_                       |                                                                                                                                                                                                                                                |
-| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                                                                                                         |
+| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                          |
 | `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                           |
 | `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~                                                                                                 |
@@ -1149,11 +1150,11 @@ vary on each step.
 >     nlp.update(batch)
 > ```
 
-| Name       | Description                              |
-| ---------- | ---------------------------------------- |
-| `items`    | The items to batch up. ~~Iterable[Any]~~ |
-| `size`     | int / iterable                           | The batch size(s). ~~Union[int, Sequence[int]]~~ |
-| **YIELDS** | The batches.                             |
+| Name       | Description                                      |
+| ---------- | ------------------------------------------------ |
+| `items`    | The items to batch up. ~~Iterable[Any]~~         |
+| `size`     | The batch size(s). ~~Union[int, Sequence[int]]~~ |
+| **YIELDS** | The batches.                                     |
 
 ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
 

From deffc3a5321fcb21f3ff4b0bed23deea81f81f12 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 16:24:31 +0100
Subject: [PATCH 030/146] Update package requirements tests (#7409)

* Add hypothesis to packages skipped in version check

* Add numpy back to tests following 2df1ab8a
---
 spacy/tests/package/test_requirements.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index a0e43ccfa..82c39b72c 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -6,15 +6,14 @@ def test_build_dependencies():
     # Check that library requirements are pinned exactly the same across different setup files.
     # TODO: correct checks for numpy rather than ignoring
     libs_ignore_requirements = [
-        "numpy",
         "pytest",
         "pytest-timeout",
         "mock",
         "flake8",
+        "hypothesis",
     ]
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
-        "numpy",
         "fugashi",
         "natto-py",
         "pythainlp",

From 81efde0ce401f3004e23fc1ed794d445b8cafa51 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 11 Mar 2021 19:49:46 +0100
Subject: [PATCH 031/146] Add examples README

---
 examples/README.md          | 130 ++++++++++++++++++++++++++++++++++++
 examples/training/README.md |   5 ++
 2 files changed, 135 insertions(+)
 create mode 100644 examples/README.md
 create mode 100644 examples/training/README.md

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 000000000..23ff59acd
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,130 @@
+<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
+
+# spaCy examples
+
+For spaCy v3 we've converted many of the [v2 example
+scripts](https://github.com/explosion/spaCy/tree/v2.3.x/examples/) into
+end-to-end [spaCy projects](https://spacy.io/usage/projects) workflows. The
+workflows include all the steps to go from data to packaged spaCy models.
+
+## 🪐 Pipeline component demos
+
+The simplest demos for training a single pipeline component are in the
+[`pipelines`](https://github.com/explosion/projects/blob/v3/pipelines) category
+including:
+
+- [`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo):
+  Train a named entity recognizer
+- [`pipelines/textcat_demo`](https://github.com/explosion/projects/blob/v3/pipelines/textcat_demo):
+  Train a text classifier
+- [`pipelines/parser_intent_demo`](https://github.com/explosion/projects/blob/v3/pipelines/parser_intent_demo):
+  Train a dependency parser for custom semantics
+
+## 🪐 Tutorials
+
+The [`tutorials`](https://github.com/explosion/projects/blob/v3/tutorials)
+category includes examples that work through specific NLP use cases end-to-end:
+
+- [`tutorials/textcat_goemotions`](https://github.com/explosion/projects/blob/v3/tutorials/textcat_goemotions):
+  Train a text classifier to categorize emotions in Reddit posts
+- [`tutorials/nel_emerson`](https://github.com/explosion/projects/blob/v3/tutorials/nel_emerson):
+  Use an entity linker to disambiguate mentions of the same name
+
+Check out the [projects documentation](https://spacy.io/usage/projects) and
+browse through the [available
+projects](https://github.com/explosion/projects/)!
+
+## 🚀 Get started with a demo project
+
+The
+[`pipelines/ner_demo`](https://github.com/explosion/projects/blob/v3/pipelines/ner_demo)
+project converts the spaCy v2
+[`train_ner.py`](https://github.com/explosion/spaCy/blob/v2.3.x/examples/training/train_ner.py)
+demo script into a spaCy v3 project.
+
+1. Clone the project:
+
+   ```bash
+   python -m spacy project clone pipelines/ner_demo
+   ```
+
+2. Install requirements and download any data assets:
+
+   ```bash
+   cd ner_demo
+   python -m pip install -r requirements.txt
+   python -m spacy project assets
+   ```
+
+3. Run the default workflow to convert, train and evaluate:
+
+   ```bash
+   python -m spacy project run all
+   ```
+
+   Sample output:
+
+   ```none
+   ℹ Running workflow 'all'
+   
+   ================================== convert ==================================
+   Running command: /home/user/venv/bin/python scripts/convert.py en assets/train.json corpus/train.spacy
+   Running command: /home/user/venv/bin/python scripts/convert.py en assets/dev.json corpus/dev.spacy
+   
+   =============================== create-config ===============================
+   Running command: /home/user/venv/bin/python -m spacy init config --lang en --pipeline ner configs/config.cfg --force
+   ℹ Generated config template specific for your use case
+   - Language: en
+   - Pipeline: ner
+   - Optimize for: efficiency
+   - Hardware: CPU
+   - Transformer: None
+   ✔ Auto-filled config with all values
+   ✔ Saved config
+   configs/config.cfg
+   You can now add your data and train your pipeline:
+   python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
+   
+   =================================== train ===================================
+   Running command: /home/user/venv/bin/python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --training.eval_frequency 10 --training.max_steps 100 --gpu-id -1
+   ℹ Using CPU
+   
+   =========================== Initializing pipeline ===========================
+   [2021-03-11 19:34:59,101] [INFO] Set up nlp object from config
+   [2021-03-11 19:34:59,109] [INFO] Pipeline: ['tok2vec', 'ner']
+   [2021-03-11 19:34:59,113] [INFO] Created vocabulary
+   [2021-03-11 19:34:59,113] [INFO] Finished initializing nlp object
+   [2021-03-11 19:34:59,265] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
+   ✔ Initialized pipeline
+   
+   ============================= Training pipeline =============================
+   ℹ Pipeline: ['tok2vec', 'ner']
+   ℹ Initial learn rate: 0.001
+   E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
+   ---  ------  ------------  --------  ------  ------  ------  ------
+     0       0          0.00      7.90    0.00    0.00    0.00    0.00
+    10      10          0.11     71.07    0.00    0.00    0.00    0.00
+    20      20          0.65     22.44   50.00   50.00   50.00    0.50
+    30      30          0.22      6.38   80.00   66.67  100.00    0.80
+    40      40          0.00      0.00   80.00   66.67  100.00    0.80
+    50      50          0.00      0.00   80.00   66.67  100.00    0.80
+    60      60          0.00      0.00  100.00  100.00  100.00    1.00
+    70      70          0.00      0.00  100.00  100.00  100.00    1.00
+    80      80          0.00      0.00  100.00  100.00  100.00    1.00
+    90      90          0.00      0.00  100.00  100.00  100.00    1.00
+   100     100          0.00      0.00  100.00  100.00  100.00    1.00
+   ✔ Saved pipeline to output directory
+   training/model-last
+   ```
+
+4. Package the model:
+
+   ```bash
+   python -m spacy project run package
+   ```
+
+5. Visualize the model's output with [Streamlit](https://streamlit.io):
+
+   ```bash
+   python -m spacy project run visualize-model
+   ```
diff --git a/examples/training/README.md b/examples/training/README.md
new file mode 100644
index 000000000..34689ceb6
--- /dev/null
+++ b/examples/training/README.md
@@ -0,0 +1,5 @@
+<a href="https://explosion.ai"><img src="https://explosion.ai/assets/img/logo.svg" width="125" height="125" align="right" /></a>
+
+# spaCy examples
+
+See [examples/README.md](../README.md)

From 508cb3bef75079cb132b4a9754a197de421d07f8 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 12 Mar 2021 09:41:59 +0100
Subject: [PATCH 032/146] Also exclude user hooks in displacy conversion
 (#7419)

---
 spacy/displacy/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 2049809a7..aa61fb9f7 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -120,7 +120,7 @@ def parse_deps(orig_doc: Doc, options: Dict[str, Any] = {}) -> Dict[str, Any]:
     doc (Doc): Document do parse.
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
-    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data"]))
+    doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes(exclude=["user_data", "user_hooks"]))
     if not doc.has_annotation("DEP"):
         warnings.warn(Warnings.W005)
     if options.get("collapse_phrases", False):

From ce6317231f047fcfc946fa9139c7f56ded9cb84f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 12 Mar 2021 09:51:26 +0100
Subject: [PATCH 033/146] Add --code to spacy debug CLI

---
 spacy/cli/debug_config.py | 2 +-
 spacy/cli/debug_data.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py
index 549072a1e..56ee12336 100644
--- a/spacy/cli/debug_config.py
+++ b/spacy/cli/debug_config.py
@@ -20,7 +20,7 @@ def debug_config_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
     show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
     # fmt: on
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 316b615c5..be11f8d1c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -39,7 +39,7 @@ def debug_data_cli(
     # fmt: off
     ctx: typer.Context,  # This is only used to read additional arguments
     config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
-    code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
     verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),
     no_format: bool = Opt(False, "--no-format", "-NF", help="Don't pretty-print the results"),

From 03e9e7b567dc05b428fd689dab2f997c260875ab Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 12 Mar 2021 10:00:31 +0100
Subject: [PATCH 034/146] Add --code option to init fill-config

---
 spacy/cli/init_config.py |  7 +++++--
 website/docs/api/cli.md  | 17 +++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/spacy/cli/init_config.py b/spacy/cli/init_config.py
index 9880c389c..55622452b 100644
--- a/spacy/cli/init_config.py
+++ b/spacy/cli/init_config.py
@@ -10,7 +10,8 @@ from jinja2 import Template
 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import string_to_list, import_code
 
 
 ROOT = Path(__file__).parent / "templates"
@@ -70,7 +71,8 @@ def init_fill_config_cli(
     base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
     output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
     pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
-    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
+    diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
+    code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
@@ -82,6 +84,7 @@ def init_fill_config_cli(
 
     DOCS: https://spacy.io/api/cli#init-fill-config
     """
+    import_code(code_path)
     fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
 
 
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index e8be0f79c..8564eff43 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -170,14 +170,15 @@ validation error with more details.
 $ python -m spacy init fill-config [base_path] [output_file] [--diff]
 ```
 
-| Name                   | Description                                                                                                                         |
-| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~           |
-| `output_file`          | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
-| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                   |
-| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                       |
-| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                          |
-| **CREATES**            | Complete and auto-filled config file for training.                                                                                  |
+| Name                   | Description                                                                                                                                                                          |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `base_path`            | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~                                                            |
+| `output_file`          | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~                                                  |
+| `--code`, `-c`         | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~                                                                    |
+| `--diff`, `-D`         | Print a visual diff highlighting the changes. ~~bool (flag)~~                                                                                                                        |
+| `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
+| **CREATES**            | Complete and auto-filled config file for training.                                                                                                                                   |
 
 ### init vectors {#init-vectors new="3" tag="command"}
 

From 316810360558b3581f0132baa37072b3cb597dd7 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 12 Mar 2021 10:04:57 +0100
Subject: [PATCH 035/146] Fix type of spacy train --output in docs

---
 website/docs/api/cli.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8564eff43..56d69ad6d 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -806,7 +806,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | Name              | Description                                                                                                                                                                                                        |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`     | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
-| `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~                                                                                                      |
+| `--output`, `-o`  | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                                                                                          |
 | `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~                               |
 | `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~                                                                                                                                                       |
 | `--gpu-id`, `-g`  | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |

From 61472e7cb385c6ca578dce2f4301fb27666e058b Mon Sep 17 00:00:00 2001
From: bsweileh <42196212+bsweileh@users.noreply.github.com>
Date: Mon, 15 Mar 2021 02:21:35 -0600
Subject: [PATCH 036/146] Update _training.md - Fix broken link on
 backpropagation (#7431)

* Update _training.md

Fix broken link on backpropagation

* Add agreement

add spacy contributor agreement
---
 .github/contributors/bsweileh.md    | 106 ++++++++++++++++++++++++++++
 website/docs/usage/101/_training.md |   2 +-
 2 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 .github/contributors/bsweileh.md

diff --git a/.github/contributors/bsweileh.md b/.github/contributors/bsweileh.md
new file mode 100644
index 000000000..13f78a4b7
--- /dev/null
+++ b/.github/contributors/bsweileh.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |  Belal               |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |  March 13, 2021      |
+| GitHub username                |  bsweileh            |
+| Website (optional)             |                      |
diff --git a/website/docs/usage/101/_training.md b/website/docs/usage/101/_training.md
index b73a83d6a..4218c1b5a 100644
--- a/website/docs/usage/101/_training.md
+++ b/website/docs/usage/101/_training.md
@@ -10,7 +10,7 @@ any other information.
 Training is an iterative process in which the model's predictions are compared
 against the reference annotations in order to estimate the **gradient of the
 loss**. The gradient of the loss is then used to calculate the gradient of the
-weights through [backpropagation](https://thinc.ai/backprop101). The gradients
+weights through [backpropagation](https://thinc.ai/docs/backprop101). The gradients
 indicate how the weight values should be changed so that the model's predictions
 become more similar to the reference labels over time.
 

From 3bcf74aca7b35680a81c4239a6823aa5f46c429a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 15 Mar 2021 11:11:06 +0100
Subject: [PATCH 037/146] Rename and update ru pymorphy2 lookup lemmatize

* To allow default lookup lemmatization with a blank Russian model,
rename pymorphy2 lookup mode to `pymorphy2_lookup`

* Bug fix: update pymorphy2 lookup lemmatize to return list rather than
string
---
 spacy/lang/ru/lemmatizer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index e4689815e..c337b9bc3 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -90,12 +90,12 @@ class RussianLemmatizer(Lemmatizer):
             return [string.lower()]
         return list(set([analysis.normal_form for analysis in filtered_analyses]))
 
-    def lookup_lemmatize(self, token: Token) -> List[str]:
+    def pymorphy2_lookup_lemmatize(self, token: Token) -> List[str]:
         string = token.text
         analyses = self._morph.parse(string)
         if len(analyses) == 1:
-            return analyses[0].normal_form
-        return string
+            return [analyses[0].normal_form]
+        return [string]
 
 
 def oc2ud(oc_tag: str) -> Tuple[str, Dict[str, str]]:

From 02b5c8a1a2e49add3eaa5434678d513861dd00ab Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 16 Mar 2021 09:48:31 +0100
Subject: [PATCH 038/146] Add py.typed

---
 MANIFEST.in    | 1 +
 spacy/py.typed | 0
 2 files changed, 1 insertion(+)
 create mode 100644 spacy/py.typed

diff --git a/MANIFEST.in b/MANIFEST.in
index b4887cdb8..8008b4507 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,7 @@ recursive-include spacy *.pyx *.pxd *.txt *.cfg *.jinja
 include LICENSE
 include README.md
 include pyproject.toml
+include spacy/py.typed
 recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
diff --git a/spacy/py.typed b/spacy/py.typed
new file mode 100644
index 000000000..e69de29bb

From 00e59be966f1710f4245af68b103033786e3f884 Mon Sep 17 00:00:00 2001
From: Paolo Arduin <paolo.arduin@errequadrosrl.com>
Date: Tue, 16 Mar 2021 18:22:03 +0100
Subject: [PATCH 039/146] Add SpikeX to spaCy universe

---
 website/meta/universe.json | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index f67b7c219..db7657591 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,36 @@
 {
     "resources": [
+        {
+            "id": "spikex",
+            "title": "SpikeX - SpaCy Pipes for Knowledge Extraction",
+            "slogan": "Use SpikeX to build knowledge extraction tools with almost-zero effort",
+            "description": "SpikeX is a collection of pipes ready to be plugged into a spaCy pipeline. It aims to help in building knowledge extraction tools with almost-zero effort.",
+            "github": "erre-quadro/spikex",
+            "pip": "spikex",
+            "code_example": [
+                "from spacy import load as spacy_load",
+                "from spikex.wikigraph import load as wg_load",
+                "from spikex.pipes import WikiPageX",
+                "",
+                "# load a spacy model and get a doc",
+                "nlp = spacy_load('en_core_web_sm')",
+                "doc = nlp('An apple a day keeps the doctor away')",
+                "# load a WikiGraph",
+                "wg = wg_load('simplewiki_core')",
+                "# get a WikiPageX and extract all pages",
+                "wikipagex = WikiPageX(wg)",
+                "doc = wikipagex(doc)",
+                "# see all pages extracted from the doc",
+                "for span in doc._.wiki_spans:",
+                "   print(span._.wiki_pages)"
+            ],
+            "category": ["pipeline", "standalone"],
+            "author": "Erre Quadro",
+            "author_links": {
+                "github": "erre-quadro",
+                "website": "https://www.errequadrosrl.com"
+            }
+        },
         {
             "id": "spacy-dbpedia-spotlight",
             "title": "DBpedia Spotlight for SpaCy",

From ef77c886388713f7651daced6c90c63165a88d6b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 17 Mar 2021 14:56:04 +0900
Subject: [PATCH 040/146] Don't warn about components not in the pipeline

See here:

https://github.com/explosion/spaCy/discussions/7463

Still need to check if there are any side effects of listeners being
present but not in the pipeline, but this commit will silence the
warnings.
---
 spacy/training/initialize.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index f7f2f21a4..d017aa909 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -74,6 +74,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     for name, proc in nlp.pipeline:
         if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
             for listener in proc.listening_components:
+                # Don't warn about components not in the pipeline
+                if listener not in nlp.pipeline:
+                    continue
+
                 if listener in frozen_components and name not in frozen_components:
                     logger.warning(Warnings.W087.format(name=name, listener=listener))
                 # We always check this regardless, in case user freezes tok2vec

From a5ffe8dfed105b089678353c5517a787c8c4240c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 11:29:57 +0100
Subject: [PATCH 041/146] Add details about pretrained pipeline design

---
 website/docs/models/index.md | 144 +++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 30b4f11d9..2ca1bf6b3 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -4,6 +4,7 @@ teaser: Downloadable trained pipelines and weights for spaCy
 menu:
   - ['Quickstart', 'quickstart']
   - ['Conventions', 'conventions']
+  - ['Pipeline Design', 'design']
 ---
 
 <!-- TODO: include interactive demo -->
@@ -53,3 +54,146 @@ For a detailed compatibility overview, see the
 [`compatibility.json`](https://github.com/explosion/spacy-models/tree/master/compatibility.json).
 This is also the source of spaCy's internal compatibility check, performed when
 you run the [`download`](/api/cli#download) command.
+
+## Pretrained pipeline design {#design}
+
+The spaCy v3 pretrained pipelines are designed to be efficient and configurable.
+For example, multiple components can share a common "token-to-vector" model and
+it's easy to swap out or disable the lemmatizer. The pipelines are designed to
+be efficient in terms of speed and size and work well when the pipeline is run
+in full.
+
+When modifying a pretrained v3 pipeline, it's important to understand how the
+components **depend on** each other. Unlike spaCy v2, where the `tagger`,
+`parser` and `ner` components were all independent, some v3 components depend on
+earlier components in the pipeline. As a result, disabling or reordering
+components can affect the annotation quality or lead to warnings and errors.
+
+Main changes from spaCy v2 models:
+
+- The [`Tok2Vec`](/api/tok2vec) component may be a separate, shared component. A
+  component like a tagger or parser can
+  [listen](/api/architectures#Tok2VecListener) to an earlier `tok2vec` or
+  `transformer` rather than having its own separate tok2vec layer.
+- Rule-based exceptions move from individual components to the
+  `attribute_ruler`. Lemma and POS exceptions move from the tokenizer exceptions
+  to the attribute ruler and the tag map and morph rules move from the tagger to
+  the attribute ruler.
+- The lemmatizer tables and processing move from the vocab and tagger to a
+  separate `lemmatizer` component.
+
+### CNN/CPU pipeline design
+
+In the `sm`/`md`/`lg` models:
+
+- The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec`
+  component.
+- The `attribute_ruler` maps `token.tag` to `token.pos` if there is no
+  `morphologizer`. The `attribute_ruler` additionally makes sure whitespace is
+  tagged consistently and copies `token.pos` to `token.tag` if there is no
+  tagger. For English, the attribute ruler can improve its mapping from
+  `token.tag` to `token.pos` if dependency parses from a `parser` are present,
+  but the parser is not required.
+- The rule-based `lemmatizer` (Dutch, English, French, Greek, Macedonian,
+  Norwegian and Spanish) requires `token.pos` annotation from either
+  `tagger`+`attribute_ruler` or `morphologizer`.
+- The `ner` component is independent with its own internal tok2vec layer.
+
+<!-- TODO: pretty diagram -->
+
+### Transformer pipeline design
+
+In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
+all listen to the `transformer` component. The `attribute_ruler` and
+`lemmatizer` have the same configuration as in the CNN models.
+
+<!-- TODO: pretty diagram -->
+
+### Modifying the default pipeline
+
+For faster processing, you may only want to run a subset of the components in a
+pretrained pipeline. The `disable` and `exclude` arguments to
+[`spacy.load`](/api/top-level#spacy.load) let you control which components are
+loaded and run. Disabled components are loaded in the background so it's
+possible to reenable them in the same pipeline in the future with
+[`nlp.enable_pipe`](/api/language/#enable_pipe). To skip loading a component
+completely, use `exclude` instead of `disable`.
+
+#### Disable part-of-speech tagging and lemmatization
+
+To disable part-of-speech tagging and lemmatization, disable the `tagger`,
+`morphologizer`, `attribute_ruler` and `lemmatizer` components.
+
+```python
+# Note: English doesn't include a morphologizer
+nlp = spacy.load("en_core_web_sm", disable=["tagger", "attribute_ruler", "lemmatizer"])
+nlp = spacy.load("en_core_web_trf", disable=["tagger", "attribute_ruler", "lemmatizer"])
+```
+
+<Infobox variant="warning" title="Rule-based lemmatizers require Token.pos">
+
+The lemmatizer depends on `tagger`+`attribute_ruler` or `morphologizer` for
+Dutch, English, French, Greek, Macedonian, Norwegian and Spanish. If you disable
+any of these components, you'll see lemmatizer warnings unless the lemmatizer is
+also disabled.
+
+</Infobox>
+
+#### Use senter rather than parser for fast sentence segmentation
+
+If you need fast sentence segmentation without dependency parses, disable the
+`parser` and use the `senter` component instead:
+
+```python
+nlp = spacy.load("en_core_web_sm")
+nlp.disable_pipe("parser")
+nlp.enable_pipe("senter")
+```
+
+The `senter` component is ~10&times; faster than the parser and more accurate
+than the rule-based `sentencizer`.
+
+#### Switch from rule-based to lookup lemmatization
+
+For the Dutch, English, French, Greek, Macedonian, Norwegian and Spanish
+pipelines, you can switch from the default rule-based lemmatizer to a lookup
+lemmatizer:
+
+```python
+# Requirements: pip install spacy-lookups-data
+nlp = spacy.load("en_core_web_sm")
+nlp.remove_pipe("lemmatizer")
+nlp.add_pipe("lemmatizer", config={"mode": "lookup"}).initialize()
+```
+
+#### Disable everything except NER
+
+For the non-transformer models, the `ner` component is independent, so you can
+disable everything else:
+
+```python
+nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
+```
+
+In the transformer models, `ner` listens to the `transformer` layer, so you can
+disable all components related tagging, parsing, and lemmatization.
+
+```python
+nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
+```
+
+#### Move NER to the end of the pipeline
+
+For access to `POS` and `LEMMA` features in an `entity_ruler`, move `ner` to the
+end of the pipeline after `attribute_ruler` and `lemmatizer`:
+
+```python
+# load without NER
+nlp = spacy.load("en_core_web_sm", exclude=["ner"])
+
+# source NER from the same pipeline package as the last component
+nlp.add_pipe("ner", source=spacy.load("en_core_web_sm"))
+
+# insert the entity ruler
+nlp.add_pipe("entity_ruler", before="ner")
+```

From 5da323fd86064046f185b972d87718cbdd41e0ab Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 12:59:05 +0100
Subject: [PATCH 042/146] Minor edits

---
 website/docs/models/index.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 2ca1bf6b3..1d03b4c3d 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -63,7 +63,7 @@ it's easy to swap out or disable the lemmatizer. The pipelines are designed to
 be efficient in terms of speed and size and work well when the pipeline is run
 in full.
 
-When modifying a pretrained v3 pipeline, it's important to understand how the
+When modifying a pretrained pipeline, it's important to understand how the
 components **depend on** each other. Unlike spaCy v2, where the `tagger`,
 `parser` and `ner` components were all independent, some v3 components depend on
 earlier components in the pipeline. As a result, disabling or reordering
@@ -175,8 +175,8 @@ disable everything else:
 nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
 ```
 
-In the transformer models, `ner` listens to the `transformer` layer, so you can
-disable all components related tagging, parsing, and lemmatization.
+In the transformer models, `ner` listens to the `transformer` component, so you
+can disable all components related to tagging, parsing, and lemmatization.
 
 ```python
 nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])

From 40bc01e66823c82a5319497ad46675b83bc7878f Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 17 Mar 2021 22:41:41 +0900
Subject: [PATCH 043/146] Proactively remove unused listeners

With this the changes in initialize.py might be unnecessary.

Requires testing.
---
 spacy/language.py            | 24 +++++++++++++++---------
 spacy/training/initialize.py | 19 +++++++++----------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 871dfafaa..04a5e843e 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1686,15 +1686,21 @@ class Language:
                 )
         # Detect components with listeners that are not frozen consistently
         for name, proc in nlp.pipeline:
-            if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
-                for listener in proc.listening_components:
-                    # If it's a component sourced from another pipeline, we check if
-                    # the tok2vec listeners should be replaced with standalone tok2vec
-                    # models (e.g. so component can be frozen without its performance
-                    # degrading when other components/tok2vec are updated)
-                    paths = sourced.get(listener, {}).get("replace_listeners", [])
-                    if paths:
-                        nlp.replace_listeners(name, listener, paths)
+            # Remove listeners not in the pipeline
+            listener_names = getattr(proc, "listening_components", [])
+            unused_listener_names = [ll for ll in listener_names if ll not in nlp.pipe_names]
+            for listener_name in unused_listener_names:
+                for listener in proc.listener_map.get(listener_name, []):
+                    proc.remove_listener(listener, listener_name)
+
+            for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+                # If it's a component sourced from another pipeline, we check if
+                # the tok2vec listeners should be replaced with standalone tok2vec
+                # models (e.g. so component can be frozen without its performance
+                # degrading when other components/tok2vec are updated)
+                paths = sourced.get(listener, {}).get("replace_listeners", [])
+                if paths:
+                    nlp.replace_listeners(name, listener, paths)
         return nlp
 
     def replace_listeners(
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index d017aa909..f623627eb 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -72,17 +72,16 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
-        if getattr(proc, "listening_components", None):  # e.g. tok2vec/transformer
-            for listener in proc.listening_components:
-                # Don't warn about components not in the pipeline
-                if listener not in nlp.pipeline:
-                    continue
+        for listener in getattr(proc, "listening_components", []):  # e.g. tok2vec/transformer
+            # Don't warn about components not in the pipeline
+            if listener not in nlp.pipe_names:
+                continue
 
-                if listener in frozen_components and name not in frozen_components:
-                    logger.warning(Warnings.W087.format(name=name, listener=listener))
-                # We always check this regardless, in case user freezes tok2vec
-                if listener not in frozen_components and name in frozen_components:
-                    logger.warning(Warnings.W086.format(name=name, listener=listener))
+            if listener in frozen_components and name not in frozen_components:
+                logger.warning(Warnings.W087.format(name=name, listener=listener))
+            # We always check this regardless, in case user freezes tok2vec
+            if listener not in frozen_components and name in frozen_components:
+                logger.warning(Warnings.W086.format(name=name, listener=listener))
     return nlp
 
 

From 9fd41d674296bcfffc064cb7bcae8f0b5dcb6880 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 14:54:04 +0100
Subject: [PATCH 044/146] Remove Language.pipe cleanup arg

---
 website/docs/api/language.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index a90476dab..ca87cbb16 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -198,7 +198,6 @@ more efficient than processing texts one-by-one.
 | `as_tuples`                                | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
 | `batch_size`                               | The number of texts to buffer. ~~Optional[int]~~                                                                                                                    |
 | `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                                     |
-| `cleanup`                                  | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~                                                                                 |
 | `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                      |
 | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~                                                                                                               |
 | **YIELDS**                                 | Documents in the order of the original text. ~~Doc~~                                                                                                                |
@@ -872,10 +871,10 @@ when loading a config with
 > replace_listeners = ["model.tok2vec"]
 > ```
 
-| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                    |
-| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~                                                                                                                                                                                                                                                                                                                        |
-| `pipe_name`    | Name of pipeline component to replace listeners for. ~~str~~                                                                                                                                                                                                                                                                                                                                                   |
+| Name           | Description                                                                                                                                                                                                                                                                                                                                                                                                                            |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `tok2vec_name` | Name of the token-to-vector component, typically `"tok2vec"` or `"transformer"`.~~str~~                                                                                                                                                                                                                                                                                                                                                |
+| `pipe_name`    | Name of pipeline component to replace listeners for. ~~str~~                                                                                                                                                                                                                                                                                                                                                                           |
 | `listeners`    | The paths to the listeners, relative to the component config, e.g. `["model.tok2vec"]`. Typically, implementations will only connect to one tok2vec component, `model.tok2vec`, but in theory, custom models can use multiple listeners. The value here can either be an empty list to not replace any listeners, or a _complete_ list of the paths to all listener layers used by the model that should be replaced.~~Iterable[str]~~ |
 
 ## Language.meta {#meta tag="property"}

From 83c1b919a7f35452a23a1016fd862e6034107cfb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 14:54:40 +0100
Subject: [PATCH 045/146] Fix positional/option in CLI types

---
 website/docs/api/cli.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 16e84e53f..73a03cba8 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -77,7 +77,7 @@ $ python -m spacy info [model] [--markdown] [--silent] [--exclude]
 
 | Name                                             | Description                                                                                   |
 | ------------------------------------------------ | --------------------------------------------------------------------------------------------- |
-| `model`                                          | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(positional)~~     |
+| `model`                                          | A trained pipeline, i.e. package name or path (optional). ~~Optional[str] \(option)~~         |
 | `--markdown`, `-md`                              | Print information as Markdown. ~~bool (flag)~~                                                |
 | `--silent`, `-s` <Tag variant="new">2.0.12</Tag> | Don't print anything, just return the values. ~~bool (flag)~~                                 |
 | `--exclude`, `-e`                                | Comma-separated keys to exclude from the print-out. Defaults to `"labels"`. ~~Optional[str]~~ |
@@ -259,7 +259,7 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
 | Name                                             | Description                                                                                                                               |
 | ------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
 | `input_file`                                     | Input file. ~~Path (positional)~~                                                                                                         |
-| `output_dir`                                     | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(positional)~~        |
+| `output_dir`                                     | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. ~~Optional[Path] \(option)~~            |
 | `--converter`, `-c` <Tag variant="new">2</Tag>   | Name of converter to use (see below). ~~str (option)~~                                                                                    |
 | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
 | `--n-sents`, `-n`                                | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~                                         |
@@ -642,7 +642,7 @@ $ python -m spacy debug profile [model] [inputs] [--n-texts]
 | Name              | Description                                                                        |
 | ----------------- | ---------------------------------------------------------------------------------- |
 | `model`           | A loadable spaCy pipeline (package name or path). ~~str (positional)~~             |
-| `inputs`          | Optional path to input file, or `-` for standard input. ~~Path (positional)~~      |
+| `inputs`          | Path to input file, or `-` for standard input. ~~Path (positional)~~               |
 | `--n-texts`, `-n` | Maximum number of texts to use if available. Defaults to `10000`. ~~int (option)~~ |
 | `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                         |
 | **PRINTS**        | Profiling information for the pipeline.                                            |
@@ -1191,14 +1191,14 @@ $ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose]
 > $ python -m spacy project dvc all
 > ```
 
-| Name              | Description                                                                                                       |
-| ----------------- | ----------------------------------------------------------------------------------------------------------------- |
-| `project_dir`     | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                           |
-| `workflow`        | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(positional)~~ |
-| `--force`, `-F`   | Force-updating config file. ~~bool (flag)~~                                                                       |
-| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~                                                              |
-| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                        |
-| **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow.                     |
+| Name              | Description                                                                                                   |
+| ----------------- | ------------------------------------------------------------------------------------------------------------- |
+| `project_dir`     | Path to project directory. Defaults to current working directory. ~~Path (positional)~~                       |
+| `workflow`        | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. ~~Optional[str] \(option)~~ |
+| `--force`, `-F`   | Force-updating config file. ~~bool (flag)~~                                                                   |
+| `--verbose`, `-V` |  Print more output generated by DVC. ~~bool (flag)~~                                                          |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                    |
+| **CREATES**       | A `dvc.yaml` file in the project directory, based on the steps defined in the given workflow.                 |
 
 ## ray {#ray new="3"}
 
@@ -1236,7 +1236,7 @@ $ python -m spacy ray train [config_path] [--code] [--output] [--n-workers] [--a
 | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config_path`       | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~                                                                |
 | `--code`, `-c`      | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~       |
-| `--output`, `-o`    | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(positional)~~                                           |
+| `--output`, `-o`    | Directory or remote storage URL for saving trained pipeline. The directory will be created if it doesn't exist. ~~Optional[Path] \(option)~~                                               |
 | `--n-workers`, `-n` | The number of workers. Defaults to `1`. ~~int (option)~~                                                                                                                                   |
 | `--address`, `-a`   | Optional address of the Ray cluster. If not set (default), Ray will run locally. ~~Optional[str] \(option)~~                                                                               |
 | `--gpu-id`, `-g`    | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                 |

From 9a254d39956ecee8dd124c6223711732324a35e4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 15:05:22 +0100
Subject: [PATCH 046/146] Include all en_core_web_sm components in examples

---
 website/docs/usage/processing-pipelines.md | 29 +++++++++++-----------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 909a9c7de..25eaf6558 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -54,9 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."]
 In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a
 (potentially very large) iterable of texts as a stream. Because we're only
 accessing the named entities in `doc.ents` (set by the `ner` component), we'll
-disable all other statistical components (the `tagger` and `parser`) during
-processing. `nlp.pipe` yields `Doc` objects, so we can iterate over them and
-access the named entity predictions:
+disable all other components during processing. `nlp.pipe` yields `Doc`
+objects, so we can iterate over them and access the named entity predictions:
 
 > #### ✏️ Things to try
 >
@@ -73,7 +72,7 @@ texts = [
 ]
 
 nlp = spacy.load("en_core_web_sm")
-for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
+for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
     # Do something with the doc here
     print([(ent.text, ent.label_) for ent in doc.ents])
 ```
@@ -144,10 +143,12 @@ nlp = spacy.load("en_core_web_sm")
 ```
 
 ... the pipeline's `config.cfg` tells spaCy to use the language `"en"` and the
-pipeline `["tok2vec", "tagger", "parser", "ner"]`. spaCy will then initialize
-`spacy.lang.en.English`, and create each pipeline component and add it to the
-processing pipeline. It'll then load in the model data from the data directory
-and return the modified `Language` class for you to use as the `nlp` object.
+pipeline
+`["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]`. spaCy
+will then initialize `spacy.lang.en.English`, and create each pipeline component
+and add it to the processing pipeline. It'll then load in the model data from
+the data directory and return the modified `Language` class for you to use as
+the `nlp` object.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -171,7 +172,7 @@ the binary data:
 ```python
 ### spacy.load under the hood
 lang = "en"
-pipeline = ["tok2vec", "tagger", "parser", "ner"]
+pipeline = ["tok2vec", "tagger", "parser", "ner", "attribute_ruler", "lemmatizer"]
 data_path = "path/to/en_core_web_sm/en_core_web_sm-3.0.0"
 
 cls = spacy.util.get_lang_class(lang)  # 1. Get Language class, e.g. English
@@ -186,7 +187,7 @@ component** on the `Doc`, in order. Since the model data is loaded, the
 components can access it to assign annotations to the `Doc` object, and
 subsequently to the `Token` and `Span` which are only views of the `Doc`, and
 don't own any data themselves. All components return the modified document,
-which is then processed by the component next in the pipeline.
+which is then processed by the next component in the pipeline.
 
 ```python
 ### The pipeline under the hood
@@ -201,9 +202,9 @@ list of human-readable component names.
 
 ```python
 print(nlp.pipeline)
-# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
+# [('tok2vec', <spacy.pipeline.Tok2Vec>), ('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>), ('attribute_ruler', <spacy.pipeline.AttributeRuler>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer>)]
 print(nlp.pipe_names)
-# ['tok2vec', 'tagger', 'parser', 'ner']
+# ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
 ```
 
 ### Built-in pipeline components {#built-in}
@@ -300,7 +301,7 @@ blocks.
 ```python
 ### Disable for block
 # 1. Use as a context manager
-with nlp.select_pipes(disable=["tagger", "parser"]):
+with nlp.select_pipes(disable=["tagger", "parser", "lemmatizer"]):
     doc = nlp("I won't be tagged and parsed")
 doc = nlp("I will be tagged and parsed")
 
@@ -324,7 +325,7 @@ The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
 argument if you only want to disable components during processing:
 
 ```python
-for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
+for doc in nlp.pipe(texts, disable=["tagger", "parser", "lemmatizer"]):
     # Do something with the doc here
 ```
 

From c9e1a9ac174abe4c8113518955e56af6ea2c5a8d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 17 Mar 2021 21:28:04 +0100
Subject: [PATCH 047/146] Add multiprocessing section

---
 website/docs/usage/processing-pipelines.md | 49 ++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 25eaf6558..9e8e87239 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -91,6 +91,55 @@ have to call `list()` on it first:
 
 </Infobox>
 
+### Multiprocessing
+
+spaCy includes built-in support for multiprocessing with
+[`nlp.pipe`](/api/language#pipe) using the `n_process` option:
+
+```python
+# Multiprocessing with 4 processes
+docs = nlp.pipe(texts, n_process=4)
+
+# With as many processes as CPUs (use with caution!)
+docs = nlp.pipe(texts, n_process=-1)
+```
+
+Depending on your platform, starting many processes with multiprocessing can
+add a lot of overhead. In particular, the default start method `spawn` used in
+macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models
+because the model data is copied in memory for each new process. See the
+[Python docs on
+multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
+for further details.
+
+For shorter tasks and in particular with `spawn`, it can be faster to use a
+smaller number of processes with a larger batch size. The optimal `batch_size`
+setting will depend on the pipeline components, the length of your documents,
+the number of processes and how much memory is available.
+
+```python
+# Default batch size is `nlp.batch_size` (typically 1000)
+docs = nlp.pipe(texts, n_process=2, batch_size=2000)
+```
+
+<Infobox title="Multiprocessing on GPU" variant="warning">
+
+Multiprocessing is not generally recommended on GPU because RAM is too limited.
+If you want to try it out, be aware that it is only possible using `spawn` due
+to limitations in CUDA.
+
+</Infobox>
+
+<Infobox title="Multiprocessing with transformer models" variant="warning">
+
+In Linux, transformer models may hang or deadlock with multiprocessing due to an
+[issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One
+suggested workaround is to use `spawn` instead of `fork` and another is to
+limit the number of threads before loading any models using
+`torch.set_num_threads(1)`.
+
+</Infobox>
+
 ## Pipelines and built-in components {#pipelines}
 
 spaCy makes it very easy to create your own pipelines consisting of reusable

From acc58719da2f0b7584eedc913fd691a8ab0c750f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 18 Mar 2021 12:49:20 +0100
Subject: [PATCH 048/146] Update custom similarity hooks example

---
 website/docs/usage/processing-pipelines.md | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 9e8e87239..836bdac67 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -1547,24 +1547,33 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
 
 | Name               | Customizes                                                                                                                                                                                                              |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `user_hooks`       | [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents)                                                                      |
+| `user_hooks`       | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents)                                                                      |
 | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
 | `user_span_hooks`  | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root)                     |
 
 ```python
 ### Add custom similarity hooks
+from spacy.language import Language
+
+
 class SimilarityModel:
-    def __init__(self, model):
-        self._model = model
+    def __init__(self, name: str, index: int):
+        self.name = name
+        self.index = index
 
     def __call__(self, doc):
         doc.user_hooks["similarity"] = self.similarity
         doc.user_span_hooks["similarity"] = self.similarity
         doc.user_token_hooks["similarity"] = self.similarity
+        return doc
 
     def similarity(self, obj1, obj2):
-        y = self._model([obj1.vector, obj2.vector])
-        return float(y[0])
+        return obj1.vector[self.index] + obj2.vector[self.index]
+
+
+@Language.factory("similarity_component", default_config={"index": 0})
+def create_similarity_component(nlp, name, index: int):
+    return SimilarityModel(name, index)
 ```
 
 ## Developing plugins and wrappers {#plugins}

From 0fb1881f36f68b42b6b096915c153ef189b21ff2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 18 Mar 2021 13:29:51 +0100
Subject: [PATCH 049/146] Reformat processing pipelines

---
 website/docs/usage/processing-pipelines.md | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 836bdac67..a669bda7d 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -54,8 +54,8 @@ texts = ["This is a text", "These are lots of texts", "..."]
 In this example, we're using [`nlp.pipe`](/api/language#pipe) to process a
 (potentially very large) iterable of texts as a stream. Because we're only
 accessing the named entities in `doc.ents` (set by the `ner` component), we'll
-disable all other components during processing. `nlp.pipe` yields `Doc`
-objects, so we can iterate over them and access the named entity predictions:
+disable all other components during processing. `nlp.pipe` yields `Doc` objects,
+so we can iterate over them and access the named entity predictions:
 
 > #### ✏️ Things to try
 >
@@ -104,12 +104,11 @@ docs = nlp.pipe(texts, n_process=4)
 docs = nlp.pipe(texts, n_process=-1)
 ```
 
-Depending on your platform, starting many processes with multiprocessing can
-add a lot of overhead. In particular, the default start method `spawn` used in
+Depending on your platform, starting many processes with multiprocessing can add
+a lot of overhead. In particular, the default start method `spawn` used in
 macOS/OS X (as of Python 3.8) and in Windows can be slow for larger models
 because the model data is copied in memory for each new process. See the
-[Python docs on
-multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
+[Python docs on multiprocessing](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
 for further details.
 
 For shorter tasks and in particular with `spawn`, it can be faster to use a
@@ -134,8 +133,8 @@ to limitations in CUDA.
 
 In Linux, transformer models may hang or deadlock with multiprocessing due to an
 [issue in PyTorch](https://github.com/pytorch/pytorch/issues/17199). One
-suggested workaround is to use `spawn` instead of `fork` and another is to
-limit the number of threads before loading any models using
+suggested workaround is to use `spawn` instead of `fork` and another is to limit
+the number of threads before loading any models using
 `torch.set_num_threads(1)`.
 
 </Infobox>
@@ -1547,7 +1546,7 @@ to `Doc.user_span_hooks` and `Doc.user_token_hooks`.
 
 | Name               | Customizes                                                                                                                                                                                                              |
 | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `user_hooks`       | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents)                                                                      |
+| `user_hooks`       | [`Doc.similarity`](/api/doc#similarity), [`Doc.vector`](/api/doc#vector), [`Doc.has_vector`](/api/doc#has_vector), [`Doc.vector_norm`](/api/doc#vector_norm), [`Doc.sents`](/api/doc#sents)                             |
 | `user_token_hooks` | [`Token.similarity`](/api/token#similarity), [`Token.vector`](/api/token#vector), [`Token.has_vector`](/api/token#has_vector), [`Token.vector_norm`](/api/token#vector_norm), [`Token.conjuncts`](/api/token#conjuncts) |
 | `user_span_hooks`  | [`Span.similarity`](/api/span#similarity), [`Span.vector`](/api/span#vector), [`Span.has_vector`](/api/span#has_vector), [`Span.vector_norm`](/api/span#vector_norm), [`Span.root`](/api/span#root)                     |
 

From 40e5d3a980886548dd0c692654f00dd26bac519a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 18 Mar 2021 16:56:10 +0100
Subject: [PATCH 050/146] Update saving/loading example

---
 website/docs/usage/saving-loading.md | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index f15493fd7..9dad077e7 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -19,9 +19,8 @@ import Serialization101 from 'usage/101/\_serialization.md'
 When serializing the pipeline, keep in mind that this will only save out the
 **binary data for the individual components** to allow spaCy to restore them –
 not the entire objects. This is a good thing, because it makes serialization
-safe. But it also means that you have to take care of storing the language name
-and pipeline component names as well, and restoring them separately before you
-can load in the data.
+safe. But it also means that you have to take care of storing the config, which
+contains the pipeline configuration and all the relevant settings.
 
 > #### Saving the meta and config
 >
@@ -33,24 +32,21 @@ can load in the data.
 
 ```python
 ### Serialize
+config = nlp.config
 bytes_data = nlp.to_bytes()
-lang = nlp.config["nlp"]["lang"]  # "en"
-pipeline = nlp.config["nlp"]["pipeline"]  # ["tagger", "parser", "ner"]
 ```
 
 ```python
 ### Deserialize
-nlp = spacy.blank(lang)
-for pipe_name in pipeline:
-    nlp.add_pipe(pipe_name)
+lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
+nlp = lang_cls.from_config(config)
 nlp.from_bytes(bytes_data)
 ```
 
 This is also how spaCy does it under the hood when loading a pipeline: it loads
 the `config.cfg` containing the language and pipeline information, initializes
-the language class, creates and adds the pipeline components based on the
-defined [factories](/usage/processing-pipeline#custom-components-factories) and
-_then_ loads in the binary data. You can read more about this process
+the language class, creates and adds the pipeline components based on the config
+and _then_ loads in the binary data. You can read more about this process
 [here](/usage/processing-pipelines#pipelines).
 
 ## Serializing Doc objects efficiently {#docs new="2.2"}

From 6354b642c5dcc806e4704f9b0caa0c0fe2543e13 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 18 Mar 2021 19:01:10 +0100
Subject: [PATCH 051/146] Fix typo

---
 website/docs/models/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 1d03b4c3d..d37e9471d 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -175,7 +175,7 @@ disable everything else:
 nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"])
 ```
 
-In the transformer models, `ner` listens to the `transformer` compoinent, so you
+In the transformer models, `ner` listens to the `transformer` component, so you
 can disable all components related tagging, parsing, and lemmatization.
 
 ```python

From 3c362ac5209d6e5a2f220a0181738c0c3b992d41 Mon Sep 17 00:00:00 2001
From: Lukas Winkler <git@lw1.at>
Date: Thu, 18 Mar 2021 21:09:11 +0100
Subject: [PATCH 052/146] replace "is not" with !=

---
 spacy/training/pretrain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/pretrain.py b/spacy/training/pretrain.py
index c791732db..6d7850212 100644
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -142,7 +142,7 @@ def create_pretraining_model(nlp, pretrain_config):
     # If the config referred to a Tok2VecListener, grab the original model instead
     if type(tok2vec).__name__ == "Tok2VecListener":
         original_tok2vec = (
-            tok2vec.upstream_name if tok2vec.upstream_name is not "*" else "tok2vec"
+            tok2vec.upstream_name if tok2vec.upstream_name != "*" else "tok2vec"
         )
         tok2vec = nlp.get_pipe(original_tok2vec).model
     try:

From 0ad9e16ec3524826e1da93043bd9bfdacaebd634 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 18 Mar 2021 21:18:25 +0100
Subject: [PATCH 053/146] Check for callbacks entry points

---
 spacy/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/util.py b/spacy/util.py
index 4b82eea8d..389e3504f 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -88,7 +88,7 @@ class registry(thinc.registry):
     displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
     misc = catalogue.create("spacy", "misc", entry_points=True)
     # Callback functions used to manipulate nlp object etc.
-    callbacks = catalogue.create("spacy", "callbacks")
+    callbacks = catalogue.create("spacy", "callbacks", entry_points=True)
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
     readers = catalogue.create("spacy", "readers", entry_points=True)
     augmenters = catalogue.create("spacy", "augmenters", entry_points=True)

From 6a9a46776661c32ae9a95d9abddf81f7b905a118 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 19 Mar 2021 08:12:49 +0100
Subject: [PATCH 054/146] Update website/docs/usage/processing-pipelines.md

Co-authored-by: Ines Montani <ines@ines.io>
---
 website/docs/usage/processing-pipelines.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index a669bda7d..52568658d 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -91,7 +91,7 @@ have to call `list()` on it first:
 
 </Infobox>
 
-### Multiprocessing
+### Multiprocessing {#multiprocessing}
 
 spaCy includes built-in support for multiprocessing with
 [`nlp.pipe`](/api/language#pipe) using the `n_process` option:

From 48b90c8e1cf3942862ebc14e61842148060fa784 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 19 Mar 2021 09:43:52 +0100
Subject: [PATCH 055/146] Update deprecated doc.is_sentenced in Corpus

---
 spacy/training/corpus.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index ae7b89f15..079b872d6 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -155,7 +155,7 @@ class Corpus:
                 continue
             elif self.max_length == 0 or len(reference) < self.max_length:
                 yield self._make_example(nlp, reference, False)
-            elif reference.is_sentenced:
+            elif reference.has_annotation("SENT_START"):
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
@@ -166,7 +166,7 @@ class Corpus:
         self, nlp: "Language", reference_docs: Iterable[Doc]
     ) -> Iterator[Example]:
         for reference in reference_docs:
-            if reference.is_sentenced:
+            if reference.has_annotation("SENT_START"):
                 ref_sents = [sent.as_doc() for sent in reference.sents]
             else:
                 ref_sents = [reference]

From c771ec22f05385f0089eeb724a44ff3ec0a7815d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 19 Mar 2021 10:11:10 +0100
Subject: [PATCH 056/146] Update matcher errors and docs

* Mention `tagger+attribute_ruler` in `POS`/`MORPH` error messages for
`Matcher` and `PhraseMatcher`
* Document `Matcher.__call__(allow_missing=)`
---
 spacy/matcher/matcher.pyx       |  4 +++-
 spacy/matcher/phrasematcher.pyx |  2 +-
 website/docs/api/matcher.md     | 13 +++++++------
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index ec5d72f9e..26dca05eb 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -202,6 +202,8 @@ cdef class Matcher:
         doclike (Doc or Span): The document to match over.
         as_spans (bool): Return Span objects with labels instead of (match_id,
             start, end) tuples.
+        allow_missing (bool): Whether to skip checks for missing annotation for
+            attributes included in patterns. Defaults to False.
         RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `match_id` is an integer. If as_spans is set
@@ -222,7 +224,7 @@ cdef class Matcher:
                     if attr == TAG:
                         pipe = "tagger"
                     elif attr in (POS, MORPH):
-                        pipe = "morphologizer"
+                        pipe = "morphologizer or tagger+attribute_ruler"
                     elif attr == LEMMA:
                         pipe = "lemmatizer"
                     elif attr == DEP:
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 088456b9a..e5ff2202c 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -194,7 +194,7 @@ cdef class PhraseMatcher:
                         if attr == TAG:
                             pipe = "tagger"
                         elif attr in (POS, MORPH):
-                            pipe = "morphologizer"
+                            pipe = "morphologizer or tagger+attribute_ruler"
                         elif attr == LEMMA:
                             pipe = "lemmatizer"
                         elif attr == DEP:
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 7c39d9caf..95a76586a 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -120,12 +120,13 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```
 
-| Name                                  | Description                                                                                                                                                                                                                                                                                              |
-| ------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doclike`                             | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                                                                                                                  |
-| _keyword-only_                        |                                                                                                                                                                                                                                                                                                          |
-| `as_spans` <Tag variant="new">3</Tag> | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
-| **RETURNS**                           | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
+| Name                                       | Description                                                                                                                                                                                                                                                                                              |
+| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doclike`                                  | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                                                                                                                  |
+| _keyword-only_                             |                                                                                                                                                                                                                                                                                                          |
+| `as_spans` <Tag variant="new">3</Tag>      | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
+| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~                                                                                                                                                                                         |
+| **RETURNS**                                | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
 

From 39153ef90f54fc92f96673789a719a4e0a8fc0b3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 19 Mar 2021 10:45:16 +0100
Subject: [PATCH 057/146] Update lexeme_norm checks

* Add util method for check
* Add new languages to list with lexeme norm tables
* Add check to all relevant components
* Add config details to warning message

Note that we're not actually inspecting the model config to see if
`NORM` is used as an attribute, so it may warn in cases where it's not
relevant.
---
 spacy/errors.py                      |  9 +++++++--
 spacy/pipeline/morphologizer.pyx     |  1 +
 spacy/pipeline/senter.pyx            |  1 +
 spacy/pipeline/tagger.pyx            |  1 +
 spacy/pipeline/transition_parser.pyx |  5 +----
 spacy/util.py                        | 26 ++++++++++++++++++++------
 6 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 4f9e90b57..d8c5cc3a8 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -73,8 +73,13 @@ class Warnings:
             "degree. If this is intentional or the language you're using "
             "doesn't have a normalization table, please ignore this warning. "
             "If this is surprising, make sure you have the spacy-lookups-data "
-            "package installed. The languages with lexeme normalization tables "
-            "are currently: {langs}")
+            "package installed and load the table in your config. The "
+            "languages with lexeme normalization tables are currently: "
+            "{langs}\n\nLoad the table in your config with:\n\n"
+            "[initialize.lookups]\n"
+            "@misc = \"spacy.LookupsDataLoader.v1\"\n"
+            "lang = ${{nlp.lang}}\n"
+            "tables = [\"lexeme_norm\"]\n")
     W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
             "attribute or operator.")
 
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 38da71ec7..cd0081346 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -137,6 +137,7 @@ class Morphologizer(Tagger):
         DOCS: https://spacy.io/api/morphologizer#initialize
         """
         validate_get_examples(get_examples, "Morphologizer.initialize")
+        util.check_lexeme_norms(self.vocab, "morphologizer")
         if labels is not None:
             self.cfg["labels_morph"] = labels["morph"]
             self.cfg["labels_pos"] = labels["pos"]
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index c03ec0462..83cd06739 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -138,6 +138,7 @@ class SentenceRecognizer(Tagger):
         DOCS: https://spacy.io/api/sentencerecognizer#initialize
         """
         validate_get_examples(get_examples, "SentenceRecognizer.initialize")
+        util.check_lexeme_norms(self.vocab, "senter")
         doc_sample = []
         label_sample = []
         assert self.labels, Errors.E924.format(name=self.name)
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 204308dcc..9af5245c1 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -249,6 +249,7 @@ class Tagger(TrainablePipe):
         DOCS: https://spacy.io/api/tagger#initialize
         """
         validate_get_examples(get_examples, "Tagger.initialize")
+        util.check_lexeme_norms(self.vocab, "tagger")
         if labels is not None:
             for tag in labels:
                 self.add_label(tag)
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 816870a3e..4de57d311 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -493,10 +493,7 @@ cdef class Parser(TrainablePipe):
 
     def initialize(self, get_examples, nlp=None, labels=None):
         validate_get_examples(get_examples, "Parser.initialize")
-        lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
-        if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
-            langs = ", ".join(util.LEXEME_NORM_LANGS)
-            util.logger.debug(Warnings.W033.format(model="parser or NER", langs=langs))
+        util.check_lexeme_norms(self.vocab, "parser or NER")
         if labels is not None:
             actions = dict(labels)
         else:
diff --git a/spacy/util.py b/spacy/util.py
index 389e3504f..9915de935 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -59,7 +59,7 @@ if TYPE_CHECKING:
 
 OOV_RANK = numpy.iinfo(numpy.uint64).max
 DEFAULT_OOV_PROB = -20
-LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta", "th"]
+LEXEME_NORM_LANGS = ["cs", "da", "de", "el", "en", "id", "lb", "mk", "pt", "ru", "sr", "ta", "th"]
 
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
@@ -70,7 +70,9 @@ CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "co
 
 logger = logging.getLogger("spacy")
 logger_stream_handler = logging.StreamHandler()
-logger_stream_handler.setFormatter(logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s"))
+logger_stream_handler.setFormatter(
+    logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s")
+)
 logger.addHandler(logger_stream_handler)
 
 
@@ -1454,10 +1456,13 @@ def is_cython_func(func: Callable) -> bool:
     if hasattr(func, attr):  # function or class instance
         return True
     # https://stackoverflow.com/a/55767059
-    if hasattr(func, "__qualname__") and hasattr(func, "__module__") \
-        and func.__module__ in sys.modules:  # method
-            cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
-            return hasattr(cls_func, attr)
+    if (
+        hasattr(func, "__qualname__")
+        and hasattr(func, "__module__")
+        and func.__module__ in sys.modules
+    ):  # method
+        cls_func = vars(sys.modules[func.__module__])[func.__qualname__.split(".")[0]]
+        return hasattr(cls_func, attr)
     return False
 
 
@@ -1508,7 +1513,16 @@ def warn_if_jupyter_cupy():
     """
     if is_in_jupyter():
         from thinc.backends.cupy_ops import CupyOps
+
         if CupyOps.xp is not None:
             from thinc.backends import contextvars_eq_thread_ops
+
             if not contextvars_eq_thread_ops():
                 warnings.warn(Warnings.W111)
+
+
+def check_lexeme_norms(vocab, component_name):
+    lexeme_norms = vocab.lookups.get_table("lexeme_norm", {})
+    if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
+        langs = ", ".join(LEXEME_NORM_LANGS)
+        logger.debug(Warnings.W033.format(model=component_name, langs=langs))

From e39c0dcf336170ee0358f1c86a7146dcc54862c4 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Sat, 20 Mar 2021 18:40:00 +0900
Subject: [PATCH 058/146] Fix mismatched backtick in Lexeme docs

---
 website/docs/api/lexeme.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index c1837fd05..c99f19482 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -133,8 +133,8 @@ The L2 norm of the lexeme's vector representation.
 | `norm_`                                      | The lexemes's norm, i.e. a normalized form of the lexeme text. ~~str~~                                                                                                                                                                                               |
 | `lower`                                      | Lowercase form of the word. ~~int~~                                                                                                                                                                                                                                  |
 | `lower_`                                     | Lowercase form of the word. ~~str~~                                                                                                                                                                                                                                  |
-| `shape`                                      | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_`                                     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~  |
+| `shape`                                      | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~  |
+| `shape_`                                     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~  |
 | `prefix`                                     | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~                                                                                                                                                                                            |
 | `prefix_`                                    | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~                                                                                                                                                                                            |
 | `suffix`                                     | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~                                                                                                                                                                                              |

From 0d2b723e8d1ae02dcdf06500188f06172b098420 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Sat, 20 Mar 2021 11:38:55 +0100
Subject: [PATCH 059/146] Update entity setting section

---
 website/docs/usage/linguistic-features.md | 26 ++++++++++++++++-------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index fd76c6e4d..40ea2bf9c 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -599,18 +599,27 @@ ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
 print('Before', ents)
 # The model didn't recognize "fb" as an entity :(
 
-fb_ent = Span(doc, 0, 1, label="ORG") # create a Span for the new entity
+# Create a span for the new entity
+fb_ent = Span(doc, 0, 1, label="ORG")
+
+# Option 1: Modify the provided entity spans, leaving the rest unmodified
+doc.set_ents([fb_ent], default="unmodified")
+
+# Option 2: Assign a complete list of ents to doc.ents
 doc.ents = list(doc.ents) + [fb_ent]
 
-ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
+ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
 print('After', ents)
-# [('fb', 0, 2, 'ORG')] 🎉
+# [('fb', 0, 1, 'ORG')] 🎉
 ```
 
-Keep in mind that you need to create a `Span` with the start and end index of
-the **token**, not the start and end index of the entity in the document. In
-this case, "fb" is token `(0, 1)` – but at the document level, the entity will
-have the start and end indices `(0, 2)`.
+Keep in mind that `Span` is initialized with the start and end **token**
+indices, not the character offsets. To create a span from character offsets, use
+[`Doc.char_span`](/api/doc#char_span):
+
+```python
+fb_ent = doc.char_span(0, 2, label="ORG")
+```
 
 #### Setting entity annotations from array {#setting-from-array}
 
@@ -645,9 +654,10 @@ write efficient native code.
 
 ```python
 # cython: infer_types=True
+from spacy.typedefs cimport attr_t
 from spacy.tokens.doc cimport Doc
 
-cpdef set_entity(Doc doc, int start, int end, int ent_type):
+cpdef set_entity(Doc doc, int start, int end, attr_t ent_type):
     for i in range(start, end):
         doc.c[i].ent_type = ent_type
     doc.c[start].ent_iob = 3

From cdab341a75b5932002b56689b88681f38b391fa3 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 23 Mar 2021 11:50:35 +0900
Subject: [PATCH 060/146] Remove mention of -1 for early stopping (fix #7535)

Maybe this used to work differently, but currently a negative patience
just causes immediate termination.
---
 spacy/default_config.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 0f7226083..42081f410 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -68,7 +68,7 @@ seed = ${system.seed}
 gpu_allocator = ${system.gpu_allocator}
 dropout = 0.1
 accumulate_gradient = 1
-# Controls early-stopping. 0 or -1 mean unlimited.
+# Controls early-stopping. 0 disables early stopping.
 patience = 1600
 max_epochs = 0
 max_steps = 20000

From d59f968d0837fe7bda04bb85e26f05aedbbd2133 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Mar 2021 13:32:00 +0200
Subject: [PATCH 061/146] Keep sent starts without parse in retokenization
 (#7424)

In the retokenizer, only reset sent starts (with
`set_children_from_head`) if the doc is parsed. If there is no parse,
merged tokens have the unset `token.is_sent_start == None` by default after
retokenization.
---
 spacy/tests/doc/test_retokenize_merge.py | 27 ++++++++++++++++++++++++
 spacy/tokens/_retokenize.pyx             |  6 ++++--
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 48cd33890..36fa3c15d 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -452,3 +452,30 @@ def test_retokenize_disallow_zero_length(en_vocab):
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(doc[1:1])
+
+
+def test_doc_retokenize_merge_without_parse_keeps_sents(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    sent_starts = [1, 0, 0, 0, 1, 0, 0, 0]  # two sentences: tokens 0-3 and 4-7
+    tokens = en_tokenizer(text)
+
+    # merging within a sentence keeps all sentence boundaries
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[1:3])
+    assert len(list(doc.sents)) == 2
+
+    # merging over a sentence boundary unsets it by default
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6])
+    assert doc[3].is_sent_start is None  # identity check: unset, not merely falsy
+
+    # merging over a sentence boundary and setting sent_start
+    doc = Doc(tokens.vocab, words=[t.text for t in tokens], sent_starts=sent_starts)
+    assert len(list(doc.sents)) == 2
+    with doc.retokenize() as retokenizer:
+        retokenizer.merge(doc[3:6], attrs={"sent_start": True})
+    assert len(list(doc.sents)) == 2
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 3cb2965a9..5c7523667 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -281,7 +281,8 @@ def _merge(Doc doc, merges):
     for i in range(doc.length):
         doc.c[i].head -= i
     # Set the left/right children, left/right edges
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
     # Make sure ent_iob remains consistent
     make_iob_consistent(doc.c, doc.length)
     # Return the merged Python object
@@ -392,7 +393,8 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     for i in range(doc.length):
         doc.c[i].head -= i
     # set children from head
-    set_children_from_heads(doc.c, 0, doc.length)
+    if doc.has_annotation("DEP"):
+        set_children_from_heads(doc.c, 0, doc.length)
 
 
 def _validate_extensions(extensions):

From 139f655f344cff20d9535455a4b72f7a76c90748 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Mar 2021 13:34:01 +0200
Subject: [PATCH 062/146] Merge doc.spans in Doc.from_docs() (#7497)

Merge data from `doc.spans` in `Doc.from_docs()`.

* Fix internal character offset set when merging empty docs (only
affects tokens and spans in `user_data` if an empty doc is in the list
of docs)
---
 spacy/errors.py                 |  3 +++
 spacy/tests/doc/test_doc_api.py |  9 +++++++++
 spacy/tokens/doc.pyx            | 30 ++++++++++++++++++++++++++++--
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index d8c5cc3a8..289d2cfed 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -497,6 +497,9 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
+            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
+            "issue: https://github.com/explosion/spaCy/issues")
     E874 = ("Could not initialize the tok2vec model from component "
             "'{component}' and layer '{layer}'.")
     E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index c27139d2f..0b915513f 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -352,6 +352,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
+    en_docs[0].spans["group"] = [en_docs[0][1:4]]
+    en_docs[2].spans["group"] = [en_docs[2][1:4]]
+    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
     docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
     expected = (True, None, None, None)
@@ -377,6 +380,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
         # not callable, because it was not set via set_extension
         m_doc[2]._.is_ambiguous
     assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
@@ -388,6 +393,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 0 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
     assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
@@ -399,6 +406,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
     assert m_doc[9].idx == think_idx
+    assert "group" in m_doc.spans
+    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
 
 def test_doc_api_from_docs_ents(en_tokenizer):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 850036483..69f900297 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -6,7 +6,7 @@ from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t
 
 import copy
-from collections import Counter
+from collections import Counter, defaultdict
 from enum import Enum
 import itertools
 import numpy
@@ -1120,6 +1120,7 @@ cdef class Doc:
         concat_words = []
         concat_spaces = []
         concat_user_data = {}
+        concat_spans = defaultdict(list)
         char_offset = 0
         for doc in docs:
             concat_words.extend(t.text for t in doc)
@@ -1137,8 +1138,17 @@ cdef class Doc:
                         warnings.warn(Warnings.W101.format(name=name))
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
+            for key in doc.spans:
+                for span in doc.spans[key]:
+                    concat_spans[key].append((
+                        span.start_char + char_offset,
+                        span.end_char + char_offset,
+                        span.label,
+                        span.kb_id,
+                        span.text, # included as a check
+                    ))
             char_offset += len(doc.text)
-            if ensure_whitespace and not (len(doc) > 0 and doc[-1].is_space):
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
@@ -1160,6 +1170,22 @@ cdef class Doc:
 
         concat_doc.from_array(attrs, concat_array)
 
+        for key in concat_spans:
+            if key not in concat_doc.spans:
+                concat_doc.spans[key] = []
+            for span_tuple in concat_spans[key]:
+                span = concat_doc.char_span(
+                        span_tuple[0],
+                        span_tuple[1],
+                        label=span_tuple[2],
+                        kb_id=span_tuple[3],
+                )
+                text = span_tuple[4]
+                if span is not None and span.text == text:
+                    concat_doc.spans[key].append(span)
+                else:
+                    raise ValueError(Errors.E873.format(key=key, text=text))
+
         return concat_doc
 
     def get_lca_matrix(self):

From 3ae86610851003eb8cd2fa70bb8f92b1492002a1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 29 Mar 2021 13:34:48 +0200
Subject: [PATCH 063/146] Fix tensor retokenization for non-numpy ops (#7527)

Implement manual `append` and `delete` for non-numpy ops.
---
 spacy/tokens/_retokenize.pyx | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 5c7523667..43e6d4aa7 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -295,7 +295,19 @@ def _resize_tensor(tensor, ranges):
         for i in range(start, end-1):
             delete.append(i)
     xp = get_array_module(tensor)
-    return xp.delete(tensor, delete, axis=0)
+    if xp is numpy:
+        return xp.delete(tensor, delete, axis=0)
+    else:
+        offset = 0
+        copy_start = 0
+        resized_shape = (tensor.shape[0] - len(delete), tensor.shape[1])
+        for start, end in ranges:
+            if copy_start > 0:
+                tensor[copy_start - offset:start - offset] = tensor[copy_start: start]
+            offset += end - start - 1
+            copy_start = end - 1
+        tensor[copy_start - offset:resized_shape[0]] = tensor[copy_start:]
+        return xp.asarray(tensor[:resized_shape[0]])
 
 
 def _split(Doc doc, int token_index, orths, heads, attrs):
@@ -332,7 +344,13 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
     to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
     if to_process_tensor:
         xp = get_array_module(doc.tensor)
-        doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
+        if xp is numpy:
+            doc.tensor = xp.append(doc.tensor, xp.zeros((nb_subtokens,doc.tensor.shape[1]), dtype="float32"), axis=0)
+        else:
+            shape = (doc.tensor.shape[0] + nb_subtokens, doc.tensor.shape[1])
+            resized_array = xp.zeros(shape, dtype="float32")
+            resized_array[:doc.tensor.shape[0]] = doc.tensor[:doc.tensor.shape[0]]
+            doc.tensor = resized_array
     for token_to_move in range(orig_length - 1, token_index, -1):
         doc.c[token_to_move + nb_subtokens - 1] = doc.c[token_to_move]
         if to_process_tensor:
@@ -349,7 +367,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
-            doc.tensor[token_index + i] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
+            doc.tensor[token_index + i:token_index + i + 1] = xp.zeros((1,doc.tensor.shape[1]), dtype="float32")
         # Update the character offset of the subtokens
         if i != 0:
             token.idx = orig_token.idx + idx_offset

From 5b4dde38a39a899ab2a16e5178078665d305c5f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Abella=20Bascar=C3=A1n?=
 <alvaroabascar@gmail.com>
Date: Tue, 30 Mar 2021 09:45:49 +0200
Subject: [PATCH 064/146] fix fn name: tokenizer.infixes_finditer ->
 tokenizer.infix_finditer (#7606)

---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 40ea2bf9c..2d3390049 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -962,7 +962,7 @@ domain. There are six things you may need to define:
    quotes, open brackets, etc.
 3. A function `suffix_search`, to handle **succeeding punctuation**, such as
    commas, periods, close quotes, etc.
-4. A function `infixes_finditer`, to handle non-whitespace separators, such as
+4. A function `infix_finditer`, to handle non-whitespace separators, such as
    hyphens etc.
 5. An optional boolean function `token_match` matching strings that should never
    be split, overriding the infix rules. Useful for things like numbers.

From af07fc3bc1abc07a00f7860addbdabf4523ad2bf Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Tue, 30 Mar 2021 03:47:33 -0400
Subject: [PATCH 065/146] Add support for CUDA 11.2 (#7583)

* Add support for CUDA 11.2

* Update the docs

* Format

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 setup.cfg                   | 2 ++
 website/docs/usage/index.md | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 09f989c54..e928e90a6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -91,6 +91,8 @@ cuda110 =
     cupy-cuda110>=5.0.0b4,<9.0.0
 cuda111 =
     cupy-cuda111>=5.0.0b4,<9.0.0
+cuda112 =
+    cupy-cuda112>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md
index cbbda2e4f..665d334f8 100644
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@@ -130,9 +130,9 @@ which provides a numpy-compatible interface for GPU arrays.
 
 spaCy can be installed on GPU by specifying `spacy[cuda]`, `spacy[cuda90]`,
 `spacy[cuda91]`, `spacy[cuda92]`, `spacy[cuda100]`, `spacy[cuda101]`,
-`spacy[cuda102]`, `spacy[cuda110]` or `spacy[cuda111]`. If you know your cuda
-version, using the more explicit specifier allows cupy to be installed via
-wheel, saving some compilation time. The specifiers should install
+`spacy[cuda102]`, `spacy[cuda110]`, `spacy[cuda111]` or `spacy[cuda112]`. If you
+know your cuda version, using the more explicit specifier allows cupy to be
+installed via wheel, saving some compilation time. The specifiers should install
 [`cupy`](https://cupy.chainer.org).
 
 ```bash

From 27a48f28025c0cb3e132c715baedb10304121365 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Mar 2021 09:49:12 +0200
Subject: [PATCH 066/146] Fix/update extension copying in Span.as_doc and
 Doc.from_docs (#7574)

* Adjust custom extension data when copying user data in `Span.as_doc()`
* Restrict `Doc.from_docs()` to adjusting offsets for custom extension
data
  * Update test to use extension
  * (Duplicate bug fix for character offset from #7497)
---
 spacy/tests/doc/test_doc_api.py | 18 ++++++++++--------
 spacy/tests/doc/test_span.py    | 13 ++++++++++++-
 spacy/tokens/doc.pyx            |  2 +-
 spacy/tokens/span.pyx           | 15 ++++++++++++++-
 4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 0b915513f..d7452a802 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -6,12 +6,14 @@ import logging
 import mock
 
 from spacy.lang.xx import MultiLanguage
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
 from spacy.lexeme import Lexeme
 from spacy.lang.en import English
 from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH
 
+from .test_underscore import clean_underscore  # noqa: F401
+
 
 def test_doc_api_init(en_vocab):
     words = ["a", "b", "c", "d"]
@@ -347,6 +349,7 @@ def test_doc_from_array_morph(en_vocab):
     assert [str(t.morph) for t in doc] == [str(t.morph) for t in new_doc]
 
 
+@pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
     en_texts_without_empty = [t for t in en_texts if len(t)]
@@ -355,10 +358,10 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
     span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
-    docs_idx = en_texts[0].index("docs")
     de_doc = de_tokenizer(de_text)
-    expected = (True, None, None, None)
-    en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = expected
+    Token.set_extension("is_ambiguous", default=False)
+    en_docs[0][2]._.is_ambiguous = True # docs
+    en_docs[2][3]._.is_ambiguous = True # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -375,11 +378,10 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     en_docs_tokens = [t for doc in en_docs for t in doc]
     assert len(m_doc) == len(en_docs_tokens)
     think_idx = len(en_texts[0]) + 1 + en_texts[2].index("think")
+    assert m_doc[2]._.is_ambiguous == True
     assert m_doc[9].idx == think_idx
-    with pytest.raises(AttributeError):
-        # not callable, because it was not set via set_extension
-        m_doc[2]._.is_ambiguous
-    assert len(m_doc.user_data) == len(en_docs[0].user_data)  # but it's there
+    assert m_doc[9]._.is_ambiguous == True
+    assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 078cc81b1..6a5689971 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -1,9 +1,11 @@
 import pytest
 from spacy.attrs import ORTH, LENGTH
-from spacy.tokens import Doc, Span
+from spacy.tokens import Doc, Span, Token
 from spacy.vocab import Vocab
 from spacy.util import filter_spans
 
+from .test_underscore import clean_underscore  # noqa: F401
+
 
 @pytest.fixture
 def doc(en_tokenizer):
@@ -219,11 +221,14 @@ def test_span_as_doc(doc):
     assert span_doc[0].idx == 0
 
 
+@pytest.mark.usefixtures("clean_underscore")
 def test_span_as_doc_user_data(doc):
     """Test that the user_data can be preserved (but not by default). """
     my_key = "my_info"
     my_value = 342
     doc.user_data[my_key] = my_value
+    Token.set_extension("is_x", default=False)
+    doc[7]._.is_x = True
 
     span = doc[4:10]
     span_doc_with = span.as_doc(copy_user_data=True)
@@ -232,6 +237,12 @@ def test_span_as_doc_user_data(doc):
     assert doc.user_data.get(my_key, None) is my_value
     assert span_doc_with.user_data.get(my_key, None) is my_value
     assert span_doc_without.user_data.get(my_key, None) is None
+    for i in range(len(span_doc_with)):
+        if i != 3:
+            assert span_doc_with[i]._.is_x is False
+        else:
+            assert span_doc_with[i]._.is_x is True
+    assert not any([t._.is_x for t in span_doc_without])
 
 
 def test_span_string_label_kb_id(doc):
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 69f900297..aae0ff374 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1127,7 +1127,7 @@ cdef class Doc:
             concat_spaces.extend(bool(t.whitespace_) for t in doc)
 
             for key, value in doc.user_data.items():
-                if isinstance(key, tuple) and len(key) == 4:
+                if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
                     data_type, name, start, end = key
                     if start is not None or end is not None:
                         start += char_offset
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 06d86d2ac..614d8fda5 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -6,6 +6,7 @@ from libc.math cimport sqrt
 import numpy
 from thinc.api import get_array_module
 import warnings
+import copy
 
 from .doc cimport token_by_start, token_by_end, get_token_attr, _get_lca_matrix
 from ..structs cimport TokenC, LexemeC
@@ -241,7 +242,19 @@ cdef class Span:
                 if cat_start == self.start_char and cat_end == self.end_char:
                     doc.cats[cat_label] = value
         if copy_user_data:
-            doc.user_data = self.doc.user_data
+            user_data = {}
+            char_offset = self.start_char
+            for key, value in self.doc.user_data.items():
+                if isinstance(key, tuple) and len(key) == 4 and key[0] == "._.":
+                    data_type, name, start, end = key
+                    if start is not None or end is not None:
+                        start -= char_offset
+                        if end is not None:
+                            end -= char_offset
+                        user_data[(data_type, name, start, end)] = copy.copy(value)
+                else:
+                    user_data[key] = copy.copy(value)
+            doc.user_data = user_data
         return doc
 
     def _fix_dep_copy(self, attrs, array):

From 921feee0927586dc0ffcbc69716c2ef3052f232a Mon Sep 17 00:00:00 2001
From: m0canu1 <47427354+m0canu1@users.noreply.github.com>
Date: Tue, 30 Mar 2021 10:23:32 +0200
Subject: [PATCH 067/146] =?UTF-8?q?Added=20more=20exception=20to=20the=20i?=
 =?UTF-8?q?talian=20language=20from=20https://forum.wordr=E2=80=A6=20(#724?=
 =?UTF-8?q?6)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Added more exception to the italian language from https://forum.wordreference.com/threads/le-abbreviazioni-nella-lingua-italiana-abbreviations-in-italian.2464189/

* Remove unnecessary exception

Co-authored-by: Alexandru Mocanu <alexandru.mocanu@augeos.it>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/lang/it/tokenizer_exceptions.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py
index 0c9968bc6..87c2929bf 100644
--- a/spacy/lang/it/tokenizer_exceptions.py
+++ b/spacy/lang/it/tokenizer_exceptions.py
@@ -17,14 +17,19 @@ _exc = {
 for orth in [
     "..",
     "....",
+    "a.C.",
     "al.",
     "all-path",
     "art.",
     "Art.",
     "artt.",
     "att.",
+    "avv.",
+    "Avv.",
     "by-pass",
     "c.d.",
+    "c/c",
+    "C.so",
     "centro-sinistra",
     "check-up",
     "Civ.",
@@ -48,6 +53,8 @@ for orth in [
     "prof.",
     "sett.",
     "s.p.a.",
+    "s.n.c",
+    "s.r.l",
     "ss.",
     "St.",
     "tel.",

From 348d1829c7c3834a37e51f9081a7f2214053e8a2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 30 Mar 2021 12:26:22 +0200
Subject: [PATCH 068/146] Preserve user data for DependencyMatcher on spans
 (#7528)

* Preserve user data for DependencyMatcher on spans

* Clean underscore in test

* Modify test to use extensions stored in user data
---
 spacy/matcher/dependencymatcher.pyx           |  2 +-
 .../tests/matcher/test_dependency_matcher.py  | 27 ++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index 4124696b3..0e601281a 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -299,7 +299,7 @@ cdef class DependencyMatcher:
         if isinstance(doclike, Doc):
             doc = doclike
         elif isinstance(doclike, Span):
-            doc = doclike.as_doc()
+            doc = doclike.as_doc(copy_user_data=True)
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
 
diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py
index a563ddaa2..fb9222aaa 100644
--- a/spacy/tests/matcher/test_dependency_matcher.py
+++ b/spacy/tests/matcher/test_dependency_matcher.py
@@ -4,7 +4,9 @@ import re
 import copy
 from mock import Mock
 from spacy.matcher import DependencyMatcher
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Token
+
+from ..doc.test_underscore import clean_underscore  # noqa: F401
 
 
 @pytest.fixture
@@ -344,3 +346,26 @@ def test_dependency_matcher_long_matches(en_vocab, doc):
     matcher = DependencyMatcher(en_vocab)
     with pytest.raises(ValueError):
         matcher.add("pattern", [pattern])
+
+
+@pytest.mark.usefixtures("clean_underscore")
+def test_dependency_matcher_span_user_data(en_tokenizer):
+    doc = en_tokenizer("a b c d e")
+    for token in doc:
+        token.head = doc[0]
+        token.dep_ = "a"
+    Token.set_extension("is_c", default=False)
+    doc[2]._.is_c = True
+    pattern = [
+        {"RIGHT_ID": "c", "RIGHT_ATTRS": {"_": {"is_c": True}}},
+    ]
+    matcher = DependencyMatcher(en_tokenizer.vocab)
+    matcher.add("C", [pattern])
+    doc_matches = matcher(doc)
+    offset = 1
+    span_matches = matcher(doc[offset:])
+    assert doc_matches and len(doc_matches) == len(span_matches)  # zip hides misses
+    for doc_match, span_match in zip(sorted(doc_matches), sorted(span_matches)):
+        assert doc_match[0] == span_match[0]
+        for doc_t_i, span_t_i in zip(doc_match[1], span_match[1]):
+            assert doc_t_i == span_t_i + offset

From 59c2069eb14d4ae5c9b68a5619c7c31f4d83c249 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Tue, 30 Mar 2021 12:43:14 +0200
Subject: [PATCH 069/146] Legacy docs (#7601)

* document legacy Tok2Vec architectures

* add TextCatEnsemble.v1 legacy documentation

* Separate legacy section in side bar
---
 website/docs/api/architectures.md |   4 +-
 website/docs/api/legacy.md        | 143 ++++++++++++++++++++++++++++++
 website/meta/sidebars.json        |   6 ++
 3 files changed, 151 insertions(+), 2 deletions(-)
 create mode 100644 website/docs/api/legacy.md

diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 9b099d8e2..4c4bf73f4 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -647,8 +647,8 @@ from the linear model, where it is stored in `model.attrs["multi_label"]`.
 
 <Accordion title="spacy.TextCatEnsemble.v1 definition" spaced>
 
-The v1 was functionally similar, but used an internal `tok2vec` instead of
-taking it as argument.
+[TextCatEnsemble.v1](/api/legacy#TextCatEnsemble_v1) was functionally similar, but used an internal `tok2vec` instead of
+taking it as an argument:
 
 | Name                 | Description                                                                                                                                                                                    |
 | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md
new file mode 100644
index 000000000..4b5e8df3a
--- /dev/null
+++ b/website/docs/api/legacy.md
@@ -0,0 +1,143 @@
+---
+title: Legacy functions and architectures
+teaser: Archived implementations available through spacy-legacy
+source: spacy/legacy
+---
+
+The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes 
+outdated registered functions and architectures. It is installed automatically as 
+a dependency of spaCy, and provides backwards compatibility for archived functions 
+that may still be used in projects.
+
+You can find the detailed documentation of each such legacy function on this page.
+
+## Architectures {#architectures}
+
+These functions are available from `@spacy.registry.architectures`.
+
+### spacy.Tok2Vec.v1 {#Tok2Vec_v1}
+
+The `spacy.Tok2Vec.v1` architecture expected an `encode` model of type
+`Model[Floats2d, Floats2d]`, such as `spacy.MaxoutWindowEncoder.v1` or
+`spacy.MishWindowEncoder.v1`.
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.Tok2Vec.v1"
+>
+> [model.embed]
+> @architectures = "spacy.CharacterEmbed.v1"
+> # ...
+>
+> [model.encode]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> # ...
+> ```
+
+Construct a tok2vec model out of two subnetworks: one for embedding and one for
+encoding. See the
+["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
+blog post for background.
+
+| Name        | Description                                                                                                                                                                                                                      |
+| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `embed`     | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
+| `encode`    | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~                            |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                           |
+
+### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1}
+
+The `spacy.MaxoutWindowEncoder.v1` architecture produced a model of type
+`Model[Floats2d, Floats2d]`. Since `spacy.MaxoutWindowEncoder.v2`, the output
+type has been changed to `Model[List[Floats2d], List[Floats2d]]`.
+
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.MaxoutWindowEncoder.v1"
+> width = 128
+> window_size = 1
+> maxout_pieces = 3
+> depth = 4
+> ```
+
+Encode context using convolutions with maxout activation, layer normalization
+and residual connections.
+
+| Name            | Description                                                                                                                                                                                                    |
+| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width`         | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
+| `window_size`   | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~                                                                                           |
+| `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~                                                                                                                                 |
+| `depth`         | The number of convolutional layers. Recommended value is `4`. ~~int~~                                                                                                                                          |
+| **CREATES**     | The model using the architecture. ~~Model[Floats2d, Floats2d]~~                                                                                                                                                |
+
+### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1}
+
+The `spacy.MishWindowEncoder.v1` architecture produced a model of type
+`Model[Floats2d, Floats2d]`. Since `spacy.MishWindowEncoder.v2`, the output
+type has been changed to `Model[List[Floats2d], List[Floats2d]]`.
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.MishWindowEncoder.v1"
+> width = 64
+> window_size = 1
+> depth = 4
+> ```
+
+Encode context using convolutions with
+[`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization
+and residual connections.
+
+| Name          | Description                                                                                                                                                                                                    |
+| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `width`       | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
+| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~                                                                                           |
+| `depth`       | The number of convolutional layers. Recommended value is `4`. ~~int~~                                                                                                                                          |
+| **CREATES**   | The model using the architecture. ~~Model[Floats2d, Floats2d]~~                                                                                                                                                |
+
+
+### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
+
+The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and `linear_model`. 
+Since `spacy.TextCatEnsemble.v2`, this has been refactored so that the `TextCatEnsemble` takes these 
+two sublayers as input.
+
+> #### Example config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatEnsemble.v1"
+> exclusive_classes = false
+> pretrained_vectors = null
+> width = 64
+> embed_size = 2000
+> conv_depth = 2
+> window_size = 1
+> ngram_size = 1
+> dropout = null
+> nO = null
+> ```
+
+Stacked ensemble of a bag-of-words model and a neural network model. The neural
+network has an internal CNN Tok2Vec layer and uses attention.
+
+| Name                 | Description                                                                                                                                                                                    |
+| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes`  | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
+| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~                                                                                                    |
+| `width`              | Output dimension of the feature encoding step. ~~int~~                                                                                                                                         |
+| `embed_size`         | Input dimension of the feature encoding step. ~~int~~                                                                                                                                          |
+| `conv_depth`         | Depth of the tok2vec layer. ~~int~~                                                                                                                                                            |
+| `window_size`        | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~                                                    |
+| `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                            |
+| `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                    |
+| `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
\ No newline at end of file
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index b4ed46019..a7e87ff72 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -132,6 +132,12 @@
                     { "text": "Classes", "url": "/api/cython-classes" },
                     { "text": "Structs", "url": "/api/cython-structs" }
                 ]
+            },
+            {
+                "label": "Legacy",
+                "items": [
+                    { "text": "Legacy functions", "url": "/api/legacy" }
+                ]
             }
         ]
     }

From 8b3eec6e62122ece5ff5188a9fece06af4f46b92 Mon Sep 17 00:00:00 2001
From: vincent d warmerdam <vincentwarmerdam@gmail.com>
Date: Thu, 1 Apr 2021 14:39:36 +0200
Subject: [PATCH 070/146] Add Tokenwiser to Projects (#7541)

* Add tokenwiser

* Update universe.json
---
 website/meta/universe.json | 43 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index db7657591..e651921ea 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -231,6 +231,49 @@
                 "website": "https://koaning.io"
             }
         },
+        {
+            "id": "tokenwiser",
+            "title": "tokenwiser",
+            "slogan": "Connect vowpal-wabbit & scikit-learn models to spaCy to run simple classification benchmarks. Comes with many utility functions for spaCy pipelines.",
+            "github": "koaning/tokenwiser",
+            "pip": "tokenwiser",
+            "thumb": "https://koaning.github.io/tokenwiser/token.png",
+            "image": "https://koaning.github.io/tokenwiser/logo-tokw.png",
+            "code_example": [
+                "import spacy",
+                "",
+                "from sklearn.pipeline import make_pipeline",
+                "from sklearn.feature_extraction.text import CountVectorizer",
+                "from sklearn.linear_model import LogisticRegression",
+                "",
+                "from tokenwiser.component import attach_sklearn_categoriser",
+                "",
+                "X = [",
+                "    'i really like this post',",
+                "    'thanks for that comment',",
+                "    'i enjoy this friendly forum',",
+                "    'this is a bad post',",
+                "    'i dislike this article',",
+                "    'this is not well written'",
+                "]",
+                "",
+                "y = ['pos', 'pos', 'pos', 'neg', 'neg', 'neg']",
+                "",
+                "# Note that we're training a pipeline here via a single-batch `.fit()` method",
+                "pipe = make_pipeline(CountVectorizer(), LogisticRegression()).fit(X, y)",
+                "",
+                "nlp = spacy.load('en_core_web_sm')",
+                "# This is where we attach our pre-trained model as a pipeline step.",
+                "attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)"
+            ],
+            "category": ["pipeline", "training"],
+            "author": "Vincent D. Warmerdam",
+            "author_links": {
+                "twitter": "fishnets88",
+                "github": "koaning",
+                "website": "https://koaning.io"
+            }
+        },
         {
             "id": "spacy-stanza",
             "title": "spacy-stanza",

From 3c2ce41dd8728dc8ebcfb891d5e10769f0b18127 Mon Sep 17 00:00:00 2001
From: Ayush Chaurasia <ayush.chaurarsia@gmail.com>
Date: Thu, 1 Apr 2021 23:06:23 +0530
Subject: [PATCH 071/146] W&B integration: Optional support for dataset and
 model checkpoint logging and versioning  (#7429)

* Add optional artifacts logging

* Update docs

* Update spacy/training/loggers.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/training/loggers.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Update spacy/training/loggers.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

* Bump WandbLogger Version

* Add documentation of v1 to legacy docs

* bump spacy-legacy to 3.0.2 (to be released)

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
---
 .github/contributors/AyushExel.md | 106 ++++++++++++++++++++++++++++++
 requirements.txt                  |   2 +-
 setup.cfg                         |   2 +-
 spacy/training/loggers.py         |  40 ++++++++++-
 spacy/training/loop.py            |   3 +-
 website/docs/api/legacy.md        |  25 ++++++-
 website/docs/api/top-level.md     |   8 ++-
 website/docs/usage/projects.md    |   2 +-
 8 files changed, 178 insertions(+), 10 deletions(-)
 create mode 100644 .github/contributors/AyushExel.md

diff --git a/.github/contributors/AyushExel.md b/.github/contributors/AyushExel.md
new file mode 100644
index 000000000..281fd0cd0
--- /dev/null
+++ b/.github/contributors/AyushExel.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [X] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Ayush Chaurasia      |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-12           |
+| GitHub username                | AyushExel            |
+| Website (optional)             |                      |
diff --git a/requirements.txt b/requirements.txt
index e09a5b221..f86efff3f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.0,<3.1.0
+spacy-legacy>=3.0.2,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.2,<8.1.0
diff --git a/setup.cfg b/setup.cfg
index e928e90a6..92e758aec 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     thinc>=8.0.2,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.0,<3.1.0
+    spacy-legacy>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 8acf2783c..ef6c86044 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -101,8 +101,13 @@ def console_logger(progress_bar: bool = False):
     return setup_printer
 
 
-@registry.loggers("spacy.WandbLogger.v1")
-def wandb_logger(project_name: str, remove_config_values: List[str] = []):
+@registry.loggers("spacy.WandbLogger.v2")
+def wandb_logger(
+    project_name: str,
+    remove_config_values: List[str] = [],
+    model_log_interval: Optional[int] = None,
+    log_dataset_dir: Optional[str] = None,
+):
     try:
         import wandb
         from wandb import init, log, join  # test that these are available
@@ -119,9 +124,23 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
         for field in remove_config_values:
             del config_dot[field]
         config = util.dot_to_dict(config_dot)
-        wandb.init(project=project_name, config=config, reinit=True)
+        run = wandb.init(project=project_name, config=config, reinit=True)
         console_log_step, console_finalize = console(nlp, stdout, stderr)
 
+        def log_dir_artifact(
+            path: str,
+            name: str,
+            type: str,
+            metadata: Optional[Dict[str, Any]] = {},
+            aliases: Optional[List[str]] = [],
+        ):
+            dataset_artifact = wandb.Artifact(name, type=type, metadata=metadata)
+            dataset_artifact.add_dir(path, name=name)
+            wandb.log_artifact(dataset_artifact, aliases=aliases)
+
+        if log_dataset_dir:
+            log_dir_artifact(path=log_dataset_dir, name="dataset", type="dataset")
+
         def log_step(info: Optional[Dict[str, Any]]):
             console_log_step(info)
             if info is not None:
@@ -133,6 +152,21 @@ def wandb_logger(project_name: str, remove_config_values: List[str] = []):
                     wandb.log({f"loss_{k}": v for k, v in losses.items()})
                 if isinstance(other_scores, dict):
                     wandb.log(other_scores)
+                if model_log_interval and info.get("output_path"):
+                    if info["step"] % model_log_interval == 0 and info["step"] != 0:
+                        log_dir_artifact(
+                            path=info["output_path"],
+                            name="pipeline_" + run.id,
+                            type="checkpoint",
+                            metadata=info,
+                            aliases=[
+                                f"epoch {info['epoch']} step {info['step']}",
+                                "latest",
+                                "best"
+                                if info["score"] == max(info["checkpoints"])[0]
+                                else "",
+                            ],
+                        )
 
         def finalize() -> None:
             console_finalize()
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 55919014b..a1242aea6 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -96,12 +96,13 @@ def train(
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
     try:
         for batch, info, is_best_checkpoint in training_step_iterator:
-            log_step(info if is_best_checkpoint is not None else None)
             if is_best_checkpoint is not None:
                 with nlp.select_pipes(disable=frozen_components):
                     update_meta(T, nlp, info)
                 if output_path is not None:
                     save_checkpoint(is_best_checkpoint)
+                    info["output_path"] = str(output_path / DIR_MODEL_LAST)
+            log_step(info if is_best_checkpoint is not None else None)
     except Exception as e:
         if output_path is not None:
             stdout.write(
diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md
index 4b5e8df3a..3e5c7f75f 100644
--- a/website/docs/api/legacy.md
+++ b/website/docs/api/legacy.md
@@ -140,4 +140,27 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `ngram_size`         | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~                                            |
 | `dropout`            | The dropout rate. ~~float~~                                                                                                                                                                    |
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
\ No newline at end of file
+| **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
+
+
+## Loggers {#loggers}
+
+These functions are available from `@spacy.registry.loggers`.
+
+### spacy.WandbLogger.v1 {#WandbLogger_v1}
+
+The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet 
+support the `log_dataset_dir` and `model_log_interval` arguments.
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.WandbLogger.v1"
+> project_name = "monitor_spacy_training"
+> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
+> ```
+| Name                   | Description                                                                                                                           |
+| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
+| `remove_config_values` | A list of values to exclude from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              |
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index eef8958cf..38bc40b11 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -461,7 +461,7 @@ start decreasing across epochs.
 
  </Accordion>
 
-#### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"}
+#### spacy.WandbLogger.v2 {#WandbLogger tag="registered function"}
 
 > #### Installation
 >
@@ -493,15 +493,19 @@ remain in the config file stored on your local system.
 >
 > ```ini
 > [training.logger]
-> @loggers = "spacy.WandbLogger.v1"
+> @loggers = "spacy.WandbLogger.v2"
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
+> log_dataset_dir = "corpus"
+> model_log_interval = 1000
 > ```
 
 | Name                   | Description                                                                                                                           |
 | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
 | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
 | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              |
+| `model_log_interval`   | Steps to wait between logging model checkpoints to W&B dashboard (default: None). ~~Optional[int]~~                                   |
+| `log_dataset_dir`      | Directory containing dataset to be logged and versioned as W&B artifact (default: None). ~~Optional[str]~~                            |
 
 <Project id="integrations/wandb">
 
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 97b5b9f28..fc191824a 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -995,7 +995,7 @@ your results.
 >
 > ```ini
 > [training.logger]
-> @loggers = "spacy.WandbLogger.v1"
+> @loggers = "spacy.WandbLogger.v2"
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > ```

From f6ad4684bd168bce45ffd7984a052aa85c79b369 Mon Sep 17 00:00:00 2001
From: Sam Edwardes <edwardes.s@gmail.com>
Date: Sun, 4 Apr 2021 11:17:57 -0700
Subject: [PATCH 072/146] Updates to universe.json for spaCyTextBlob (#7647)

* Updates to universe.json for spaCyTextBlob

Updated the documentation for spaCy 3.0.

* SamEdwardes.md

* Update SamEdwardes.md
---
 .github/contributors/SamEdwardes.md | 106 ++++++++++++++++++++++++++++
 website/meta/universe.json          |  13 ++--
 2 files changed, 112 insertions(+), 7 deletions(-)
 create mode 100644 .github/contributors/SamEdwardes.md

diff --git a/.github/contributors/SamEdwardes.md b/.github/contributors/SamEdwardes.md
new file mode 100644
index 000000000..4e6453ac7
--- /dev/null
+++ b/.github/contributors/SamEdwardes.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sam Edwardes         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-04-02           |
+| GitHub username                | SamEdwardes          |
+| Website (optional)             | samedwardes.com      |
diff --git a/website/meta/universe.json b/website/meta/universe.json
index e651921ea..7aba03fd1 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -62,8 +62,8 @@
         {
             "id": "spacy-textblob",
             "title": "spaCyTextBlob",
-            "slogan": "Easy sentiment analysis for spaCy using TextBlob",
-            "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extenstion `._.sentiment` to `Doc`, `Span`, and `Token` objects.",
+            "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!",
+            "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install spacytextblob==0.1.7`",
             "github": "SamEdwardes/spaCyTextBlob",
             "pip": "spacytextblob",
             "code_example": [
@@ -71,13 +71,12 @@
                 "from spacytextblob.spacytextblob import SpacyTextBlob",
                 "",
                 "nlp = spacy.load('en_core_web_sm')",
-                "spacy_text_blob = SpacyTextBlob()",
-                "nlp.add_pipe(spacy_text_blob)",
+                "nlp.add_pipe('spacytextblob')",
                 "text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'",
                 "doc = nlp(text)",
-                "doc._.sentiment.polarity      # Polarity: -0.125",
-                "doc._.sentiment.subjectivity  # Sujectivity: 0.9",
-                "doc._.sentiment.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]"
+                "doc._.polarity      # Polarity: -0.125",
+                "doc._.subjectivity  # Subjectivity: 0.9",
+                "doc._.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]"
             ],
             "code_language": "python",
             "url": "https://spacytextblob.netlify.app/",

From 7944761ba7335048b6d81784cfdcedecf87b3cac Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Mon, 5 Apr 2021 03:20:24 +0900
Subject: [PATCH 073/146] Add warning if initial vectors are empty (#7641)

See #7637, where this came up.
---
 spacy/errors.py              | 2 ++
 spacy/training/initialize.py | 4 ++++
 2 files changed, 6 insertions(+)

diff --git a/spacy/errors.py b/spacy/errors.py
index 289d2cfed..89b09c09a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -157,6 +157,8 @@ class Warnings:
             "`spacy.load()` to ensure that the model is loaded on the correct "
             "device. More information: "
             "http://spacy.io/usage/v3#jupyter-notebook-gpu")
+    W112 = ("The model specified to use for initial vectors ({name}) has no "
+            "vectors. This is almost certainly a mistake.")
 
 
 @add_codes
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index f623627eb..69861a9a9 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -133,6 +133,10 @@ def load_vectors_into_model(
         )
         err = ConfigValidationError.from_error(e, title=title, desc=desc)
         raise err from None
+
+    if len(vectors_nlp.vocab.vectors.keys()) == 0:
+        logger.warning(Warnings.W112.format(name=name))
+
     nlp.vocab.vectors = vectors_nlp.vocab.vectors
     if add_strings:
         # I guess we should add the strings from the vectors_nlp model?

From 93ee74a0a68b5224378de57280b14577f8b7dcd2 Mon Sep 17 00:00:00 2001
From: Jaidev Deshpande <deshpande.jaidev@gmail.com>
Date: Mon, 5 Apr 2021 22:32:27 +0530
Subject: [PATCH 074/146] Add Numerizer to SpaCy universe (#7650)

Numerizer is a spaCy extension that converts numbers written in natural language
into numeric strings.
---
 website/meta/universe.json | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 7aba03fd1..dcc9ce3d4 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,27 @@
 {
     "resources": [
+	{
+	    "id": "numerizer",
+	    "title": "numerizer",
+	    "slogan": "Convert natural language numerics into ints and floats.",
+	    "description": "A spaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.",
+	    "github": "jaidevd/numerizer",
+	    "pip": "numerizer",
+	    "code_example": [
+		"from spacy import load",
+		"import numerizer",
+		"nlp = load('en_core_web_sm') # or any other model",
+		"doc = nlp('The Hogwarts Express is at platform nine and three quarters')",
+		"doc._.numerize()",
+		"# {nine and three quarters: '9.75'}"
+	    ],
+	    "author": "Jaidev Deshpande",
+	    "author_links": {
+		"github": "jaidevd",
+		"twitter": "jaidevd"
+	    },
+	    "category": ["standalone"]
+	},
         {
             "id": "spikex",
             "title": "SpikeX - SpaCy Pipes for Knowledge Extraction",

From 1d1cfadbca44193702b159b3f6545fb59b56da65 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Apr 2021 14:13:13 +1000
Subject: [PATCH 075/146] Fix formatting [ci skip]

---
 website/docs/api/doc.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 45feb8774..c8917efa1 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -234,14 +234,14 @@ Set the named entities in the document.
 > assert ents[0].text == "Mr. Best"
 > ```
 
-| Name           | Description                                                                                                                                                                               |
-| -------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| entities       | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                      |
-| _keyword-only_ |                                                                                                                                                                                           |
-| blocked        | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                        |
-| missing        | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                   |
-| outside        | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                            |
-| default        | How to set entity annotation for tokens outside of any provided spans. Options: "blocked", "missing", "outside" and "unmodified" (preserve current state). Defaults to "outside". ~~str~~ |
+| Name           | Description                                                                                                                                                                                         |
+| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `entities`     | Spans with labels to set as entities. ~~List[Span]~~                                                                                                                                                |
+| _keyword-only_ |                                                                                                                                                                                                     |
+| `blocked`      | Spans to set as "blocked" (never an entity) for spacy's built-in NER component. Other components may ignore this setting. ~~Optional[List[Span]]~~                                                  |
+| `missing`      | Spans with missing/unknown entity information. ~~Optional[List[Span]]~~                                                                                                                             |
+| `outside`      | Spans outside of entities (O in IOB). ~~Optional[List[Span]]~~                                                                                                                                      |
+| `default`      | How to set entity annotation for tokens outside of any provided spans. Options: `"blocked"`, `"missing"`, `"outside"` and `"unmodified"` (preserve current state). Defaults to `"outside"`. ~~str~~ |
 
 ## Doc.similarity {#similarity tag="method" model="vectors"}
 

From 5bbdd7dc4c47f153e6e0de5ee4156cf52ab2695f Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Apr 2021 14:13:22 +1000
Subject: [PATCH 076/146] Update pipeline design docs [ci skip]

---
 website/docs/images/pipeline-design.svg | 49 +++++++++++++++++++++++++
 website/docs/models/index.md            | 14 +++----
 2 files changed, 56 insertions(+), 7 deletions(-)
 create mode 100644 website/docs/images/pipeline-design.svg

diff --git a/website/docs/images/pipeline-design.svg b/website/docs/images/pipeline-design.svg
new file mode 100644
index 000000000..88ccdab99
--- /dev/null
+++ b/website/docs/images/pipeline-design.svg
@@ -0,0 +1,49 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="837" height="259" viewBox="0 0 837 259">
+  <defs>
+    <linearGradient id="a" x1="60.6%" x2="50%" y1="100%" y2="100%">
+      <stop offset="0%" stop-color="#B2D73A"/>
+      <stop offset="100%" stop-color="#F0A439"/>
+    </linearGradient>
+    <linearGradient id="b" x1="60.6%" x2="50%" y1="100%" y2="100%">
+      <stop offset="0%" stop-color="#CDB217"/>
+      <stop offset="100%" stop-color="#F0A439"/>
+    </linearGradient>
+    <linearGradient id="c" x1="100%" x2="0%" y1="50%" y2="50%">
+      <stop offset="0%" stop-color="#3AD787"/>
+      <stop offset="100%" stop-color="#CDB217"/>
+    </linearGradient>
+    <linearGradient id="d" x1="100%" x2="0%" y1="50%" y2="50%">
+      <stop offset="0%" stop-color="#3A8DD7"/>
+      <stop offset="100%" stop-color="#3AD787"/>
+    </linearGradient>
+  </defs>
+  <g fill="none" fill-rule="evenodd">
+    <path fill="#F2D7B2" stroke="#F0A439" stroke-linejoin="round" stroke-width="3.8" d="M27 90h148.6l23.4 40.9-23.4 39.1H27l23.4-39z"/>
+    <path fill="#F2E7A6" stroke="#CDB217" stroke-linejoin="round" stroke-width="3.8" d="M161 90h134.6l23.4 40.9-23.4 39.1H161l23.4-39z"/>
+    <path fill="#D7E99A" stroke="#B2D73A" stroke-linejoin="round" stroke-width="3.8" d="M286 90h134.6l23.4 40.9-23.4 39.1H286l23.4-39z"/>
+    <path fill="#B5F3D4" stroke="#3AD787" stroke-linejoin="round" stroke-width="3.8" d="M417 90h134.6l23.4 40.9-23.4 39.1H417l23.4-39z"/>
+    <path fill="#B5D8F3" stroke="#3A8DD7" stroke-linejoin="round" stroke-width="3.8" d="M537 90h134.6l23.4 40.9-23.4 39.1H537l23.4-39z"/>
+    <rect width="100.5" height="23.5" x="4.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="11.8" transform="translate(169 156)"/>
+    <rect width="100.5" height="23.5" x="4.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="11.8" transform="translate(296 156)"/>
+    <rect width="47.5" height="16.5" x="1.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="8.3" transform="translate(453 81)"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M462 96v-4.8c0-.5.2-.9.4-1.1.3-.3.6-.5 1-.5.6 0 1 .2 1.2.5.3.2.4.6.4 1.1v.4h1.9v-.5a3 3 0 00-.7-2.2c-.5-.5-1.2-.7-2-.7-.6 0-1 0-1.4.3-.4.3-.6.7-.7 1.1h-.1v-1.3h-1.7V96h1.8zm9.2.1c1 0 1.8-.2 2.3-.8a3 3 0 00.8-2.2v-4.8h-1.7v4.8c0 1-.5 1.5-1.4 1.5-1 0-1.4-.5-1.4-1.5v-4.8H468v4.8c0 1 .3 1.7.9 2.2.5.6 1.3.8 2.3.8zm11.6-.1v-1.6h-2.3c-.2 0-.5 0-.6-.2a.9.9 0 01-.3-.7V86h-4.2v1.5h2.5v6.1c0 .8.2 1.4.7 1.8.4.5 1 .7 1.8.7h2.4zm4.4.1l1.2-.1 1-.4c.2-.2.5-.4.6-.7l.5-.8h-1.8c0 .2-.3.3-.5.5l-1 .1c-.5 0-.9-.1-1.2-.4-.3-.3-.4-.7-.4-1.2v-.5h5v-1.4c0-.5-.2-.9-.3-1.3l-.7-1a3 3 0 00-1-.5 4 4 0 00-1.4-.2c-.5 0-1 0-1.4.2a3 3 0 00-1 .6c-.3.2-.5.6-.7 1-.2.3-.2.7-.2 1.2v2a2.7 2.7 0 002 2.8l1.3.1zm1.6-4.7h-3.2v-.2c0-.5.1-1 .4-1.2.3-.3.7-.4 1.2-.4s.9.1 1.2.4c.3.3.4.7.4 1.2v.2zm6.7 4.7c1 0 1.7-.2 2.3-.6.5-.4.8-1 .8-1.7a2 2 0 00-.7-1.7c-.4-.3-1.1-.6-2-.7l-1.1-.2c-.7 0-1-.3-1-.8 0-.6.3-.9 1.2-.9h.5c.3 0 .6 0 .8.2.2.1.4.3.4.5h1.8c-.1-.6-.4-1.1-1-1.5-.4-.4-1.1-.5-2-.5h-.5c-1 0-1.7.2-2.2.5a2 2 0 00-.8 1.7c0 .7.2 1.3.6 1.6.4.4 1.1.7 2 .8h1c.9.2 1.3.5 1.3 1 0 .3-.2.6-.4.7-.2.2-.6.3-1 .3h-.6c-.3 0-.6 0-.8-.2a.9.9 0 01-.5-.6h-1.8c.1.7.4 1.2 1 1.6.5.4 1.2.5 2.1.5h.6z"/>
+    <rect width="47.5" height="16.5" x="1.8" y="1.8" fill="#3D4251" stroke="#3D4251" stroke-width="3.5" rx="8.3" transform="translate(586 81)"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M595 96v-4.8c0-.5.2-.9.4-1.1.3-.3.6-.5 1-.5.6 0 1 .2 1.2.5.3.2.4.6.4 1.1v.4h1.9v-.5a3 3 0 00-.7-2.2c-.5-.5-1.2-.7-2-.7-.6 0-1 0-1.4.3-.4.3-.6.7-.7 1.1h-.1v-1.3h-1.7V96h1.8zm9.2.1c1 0 1.8-.2 2.3-.8a3 3 0 00.8-2.2v-4.8h-1.7v4.8c0 1-.5 1.5-1.4 1.5-1 0-1.4-.5-1.4-1.5v-4.8H601v4.8c0 1 .3 1.7.9 2.2.5.6 1.3.8 2.3.8zm11.6-.1v-1.6h-2.3c-.2 0-.5 0-.6-.2a.9.9 0 01-.3-.7V86h-4.2v1.5h2.5v6.1c0 .8.2 1.4.7 1.8.4.5 1 .7 1.8.7h2.4zm4.4.1l1.2-.1 1-.4c.2-.2.5-.4.6-.7l.5-.8h-1.8c0 .2-.3.3-.5.5l-1 .1c-.5 0-.9-.1-1.2-.4-.3-.3-.4-.7-.4-1.2v-.5h5v-1.4c0-.5-.2-.9-.3-1.3l-.7-1a3 3 0 00-1-.5 4 4 0 00-1.4-.2c-.5 0-1 0-1.4.2a3 3 0 00-1 .6c-.3.2-.5.6-.7 1-.2.3-.2.7-.2 1.2v2a2.7 2.7 0 002 2.8l1.3.1zm1.6-4.7h-3.2v-.2c0-.5.1-1 .4-1.2.3-.3.7-.4 1.2-.4s.9.1 1.2.4c.3.3.4.7.4 1.2v.2zm6.7 4.7c1 0 1.7-.2 2.3-.6.5-.4.8-1 .8-1.7a2 2 0 00-.7-1.7c-.4-.3-1.1-.6-2-.7l-1.1-.2c-.7 0-1-.3-1-.8 0-.6.3-.9 1.2-.9h.5c.3 0 .6 0 .8.2.2.1.4.3.4.5h1.8c-.1-.6-.4-1.1-1-1.5-.4-.4-1.1-.5-2-.5h-.5c-1 0-1.7.2-2.2.5a2 2 0 00-.8 1.7c0 .7.2 1.3.6 1.6.4.4 1.1.7 2 .8h1c.9.2 1.3.5 1.3 1 0 .3-.2.6-.4.7-.2.2-.6.3-1 .3h-.6c-.3 0-.6 0-.8-.2a.9.9 0 01-.5-.6h-1.8c.1.7.4 1.2 1 1.6.5.4 1.2.5 2.1.5h.6z"/>
+    <path stroke="url(#a)" stroke-linecap="square" stroke-width="4" d="M264 7v39.6L8 46.7V30.9" transform="translate(87 178)"/>
+    <path stroke="url(#b)" stroke-linecap="square" stroke-width="4" d="M137 7v27.6L8 34.7V18.9" transform="translate(87 178)"/>
+    <path fill="#F0A439" d="M95 179l8 16H87z"/>
+    <path fill="#3D4251" fill-rule="nonzero" d="M207.3 133.2l1.8-.3c.6-.2 1-.5 1.5-.8l-1-1.5v-.2l-.3-.1h-.2a4.4 4.4 0 01-.6.3l-.5.1a1 1 0 01-.8-.3c-.2-.3-.3-.6-.3-1v-7h3.4v-2.1h-3.4v-4h-2.1l-.2.5-.7 3.5-2 .3v1.2c0 .3 0 .4.2.5l.4.2h1.3v7.2c0 1 .3 2 .9 2.6.6.6 1.4.9 2.6.9zm8.5 0l1.3-.1a5 5 0 002-1l1-.7.2 1 .4.5.7.1h1.4v-8.1a6 6 0 00-.4-2c-.2-.6-.5-1.1-.9-1.6-.4-.4-.9-.7-1.5-1-.6-.2-1.2-.4-2-.4-2 0-3.8.7-5.3 2l.5 1 .4.4c.2.2.3.2.5.2.3 0 .5 0 .7-.2a31.8 31.8 0 001.6-.8l1.2-.1c.7 0 1.3.2 1.6.6.4.4.6 1 .6 1.9v.8c-1.5 0-2.7.1-3.7.4-1 .2-1.8.5-2.4 1-.6.3-1 .7-1.2 1.2a3 3 0 00-.4 1.4c0 .6 0 1.1.3 1.5a3 3 0 002 1.8l1.4.2zm1-2.1a2 2 0 01-1.3-.4c-.3-.2-.5-.6-.5-1.1 0-.3.1-.6.3-.8l.7-.6a6 6 0 011.5-.4 17 17 0 012.3-.3v2.2l-.7.6a3.3 3.3 0 01-1.4.7h-1zm13.5 6.4c1 0 1.8-.1 2.6-.4.8-.2 1.4-.6 2-1 .5-.4.9-.9 1.2-1.4.2-.5.4-1 .4-1.6 0-.7-.1-1.2-.4-1.5-.2-.4-.5-.7-1-1l-1.2-.5a12 12 0 00-3-.3h-1.4a3 3 0 01-1-.3.6.6 0 01-.3-.6c0-.3.2-.5.6-.8a7.2 7.2 0 003.4-.1c.7-.2 1.2-.5 1.6-.9a3.9 3.9 0 001.2-4.4l1.1-.3c.3 0 .5 0 .6-.2l.1-.4v-1.2h-3.7l-1.3-.5a6.8 6.8 0 00-5.2 1 3.9 3.9 0 00-1.4 3 3.7 3.7 0 002 3.5 3.6 3.6 0 00-1.2 1l-.3.6-.1.6c0 .5 0 .8.3 1.1.2.3.4.6.7.7-.6.3-1.1.6-1.5 1-.4.4-.5 1-.5 1.5s0 .9.3 1.3c.2.4.6.8 1 1 .5.4 1.1.6 1.8.8.8.2 1.6.3 2.6.3zm0-11.2a3 3 0 01-1-.2 1.8 1.8 0 01-1.1-1.1c-.2-.2-.2-.5-.2-.8 0-.7.2-1.2.5-1.5.4-.4 1-.6 1.7-.6.8 0 1.4.2 1.8.6.3.3.5.8.5 1.5l-.1.8a1.8 1.8 0 01-1.2 1.1l-1 .2zm0 9l-1.4-.1c-.4 0-.7-.2-1-.3l-.5-.5-.2-.7c0-.3.1-.6.4-.9.2-.2.4-.5.8-.7a15.1 15.1 0 002.4.2 16.7 16.7 0 012 .3c.3 0 .5.2.6.4.2.1.2.3.2.6 0 .2 0 .5-.2.7 0 .2-.3.4-.6.5-.2.2-.6.3-1 .4l-1.5.1zm13 2.2c.9 0 1.8-.1 2.5-.4.8-.2 1.5-.6 2-1s1-.9 1.2-1.4c.3-.5.4-1 .4-1.6 0-.7-.1-1.2-.3-1.5l-1-1-1.3-.5a12 12 0 00-3-.3h-1.3a3 3 0 01-1-.3.6.6 0 01-.3-.6c0-.3.2-.5.6-.8a7.2 7.2 0 003.4-.1c.6-.2 1.1-.5 1.6-.9a3.9 3.9 0 001.1-4.4l1.2-.3c.2 0 .4 0 .5-.2l.1-.4v-1.2h-3.6l-1.4-.5a6.8 6.8 0 00-5.2 1 3.9 3.9 0 00-1.4 3 3.7 3.7 0 002 3.5 3.6 3.6 0 00-1.2 1l-.3.6v.6c0 .5 0 .8.2 1.1.2.3.5.6.8.7-.7.3-1.2.6-1.6 
1-.3.4-.5 1-.5 1.5s.1.9.3 1.3c.3.4.6.8 1.1 1 .5.4 1 .6 1.8.8.7.2 1.5.3 2.5.3zm-.1-11.2a3 3 0 01-1-.2 1.8 1.8 0 01-1.1-1.1l-.2-.8c0-.7.2-1.2.6-1.5.4-.4 1-.6 1.7-.6s1.3.2 1.7.6c.4.3.6.8.6 1.5 0 .3 0 .6-.2.8a1.8 1.8 0 01-1.1 1.1l-1 .2zm0 9l-1.4-.1c-.4 0-.7-.2-1-.3l-.5-.5-.1-.7c0-.3 0-.6.3-.9l.8-.7a15.1 15.1 0 002.4.2 16.7 16.7 0 012 .3c.3 0 .5.2.6.4.2.1.3.3.3.6 0 .2 0 .5-.2.7-.1.2-.3.4-.6.5l-1 .4-1.5.1zm14-2.1a10.1 10.1 0 002.7-.5 5.1 5.1 0 002.5-1.7l-.9-1-.2-.3h-.3c-.2 0-.4 0-.6.2a22 22 0 01-1.7.7l-1.3.2c-1 0-1.8-.3-2.4-1-.7-.5-1-1.5-1.1-2.8h8.3l.2-.3.1-.3v-.6c0-1-.1-1.8-.4-2.5-.3-.7-.6-1.3-1.1-1.8-.5-.5-1.1-.9-1.8-1.1-.7-.3-1.4-.4-2.2-.4-1 0-1.8.1-2.6.5a5.7 5.7 0 00-3.2 3.3c-.2.8-.4 1.6-.4 2.5 0 1.1.2 2.1.5 3a5.6 5.6 0 003.3 3.5c.8.2 1.6.4 2.5.4zm2.6-8H254c.1-1 .4-1.7 1-2.3.4-.5 1.1-.7 2-.7.5 0 .9 0 1.2.2l.9.6.5 1 .2 1.1zm8.3 7.8v-8a4 4 0 011.2-1.5c.5-.4 1-.6 1.6-.6l1 .1.4.1h.4l.1-.4.4-2.3c-.4-.3-1-.5-1.6-.5-.7 0-1.4.3-2 .7a6 6 0 00-1.6 1.9l-.2-1.5c0-.3-.2-.5-.3-.7l-.6-.1H265V133h3.1zM71 138.2l1.8-.3c.6-.2 1-.5 1.5-.8l-.9-1.5-.2-.2-.2-.1h-.2a4.4 4.4 0 01-.6.3l-.5.1a1 1 0 01-.8-.3c-.2-.3-.3-.6-.3-1v-7H74v-2.1h-3.4v-4h-2.1l-.2.5-.6 3.5-2.1.3v1.2c0 .3 0 .4.2.5l.4.2h1.3v7.2c0 1 .3 2 .9 2.6.6.6 1.5.9 2.6.9zm10.9 0a7 7 0 002.6-.5 5.5 5.5 0 003.3-3.4 8 8 0 00.4-2.7c0-1-.1-2-.4-2.8-.3-.8-.8-1.5-1.3-2-.5-.6-1.2-1-2-1.4a7 7 0 00-2.6-.4 7 7 0 00-2.6.4 5.7 5.7 0 00-3.3 3.4c-.3.8-.5 1.8-.5 2.8 0 1 .2 1.9.5 2.7a5.6 5.6 0 003.3 3.4 7 7 0 002.6.5zm0-2.4c-1 0-1.9-.3-2.4-1-.5-.8-.8-1.8-.8-3.2 0-1.4.3-2.4.8-3.2.5-.7 1.3-1 2.4-1 1 0 1.9.3 2.4 1 .5.8.7 1.8.7 3.2 0 1.4-.2 2.4-.7 3.1-.5.8-1.3 1.1-2.4 1.1zm11.9 2.2v-5.9h.7l.5.1c.2 0 .3.2.4.4l3.3 4.8c.1.3.2.4.4.5l.6.1h2.8l-4.3-6.2a10 10 0 00-.4-.6l-.4-.4.5-.4.4-.5 4-4.7h-2.8l-.7.1-.4.4-3.2 4-.4.3h-1v-10.6h-3.1V138h3zm22.4 0v-1.8a1 1 0 00-.3-.8c-.2-.2-.4-.3-.8-.3h-4.6l-1.1.1-1.2.3 4.1-4.2a72.2 72.2 0 002.5-3c.4-.5.6-1 .8-1.6a5.6 5.6 0 000-4 4.7 4.7 0 00-3-2.6 7.4 7.4 0 00-4.5 0 5.6 5.6 0 00-3.2 2.6c-.4.6-.6 1.4-.8 2.3l1.7.3h.4c.3 0 .5 0 
.7-.2l.4-.7a3 3 0 011-1.5c.5-.4 1-.6 1.8-.6.4 0 .8 0 1.1.2a2.3 2.3 0 011.4 1.3l.2 1.2-.2 1.3c-.2.5-.3.9-.6 1.3a26 26 0 01-2.1 2.6l-5.5 5.5a1.8 1.8 0 00-.6 1.3v1h12.4zm9.1 0l5.1-12.8H128l-.5.1c-.2.1-.3.3-.3.5l-2.6 6.8a10.8 10.8 0 00-.6 2.3 22.8 22.8 0 00-.6-2.3l-2.5-6.8c0-.2-.2-.4-.3-.5a1 1 0 00-.6-.1h-2.5l5 12.8h2.8zm12.4.2a10.1 10.1 0 002.8-.5c.5-.1 1-.3 1.3-.6.5-.3.9-.6 1.2-1l-.9-1.1-.2-.3h-.4c-.2 0-.4 0-.6.2a22 22 0 01-1.6.7l-1.3.2c-1 0-1.8-.3-2.5-1-.6-.5-1-1.5-1-2.8H142.7l.3-.3v-.3l.1-.6c0-1-.1-1.8-.4-2.5-.3-.7-.7-1.3-1.1-1.8-.5-.5-1.1-.9-1.8-1.1-.7-.3-1.4-.4-2.2-.4-1 0-1.9.1-2.6.5a5.7 5.7 0 00-3.2 3.3c-.3.8-.4 1.6-.4 2.5 0 1.1.2 2.1.5 3a5.6 5.6 0 003.3 3.5c.8.2 1.6.4 2.5.4zm2.6-8h-5.8c.2-1 .5-1.7 1-2.2.5-.6 1.2-.8 2.1-.8.5 0 .9 0 1.2.2l.9.7c.2.2.4.5.5.9l.1 1.1zm10.2 8a9.5 9.5 0 002.7-.5 6 6 0 002.4-1.6l-1-1.1a.6.6 0 00-.5-.3c-.2 0-.3 0-.5.2l-.5.4-.8.4-1.2.1c-.5 0-1 0-1.3-.2-.4-.2-.8-.5-1-.9-.3-.3-.5-.8-.7-1.3a7 7 0 01-.2-1.8c0-.7 0-1.3.2-1.8l.6-1.3c.3-.4.6-.7 1-.9l1.5-.3a3.4 3.4 0 011.8.5l.5.3.5.2.4-.1.2-.3.8-1.1c-.5-.6-1.2-1-1.9-1.4-.7-.3-1.6-.4-2.5-.4-1 0-2 .1-2.7.5a5.5 5.5 0 00-3.1 3.4c-.3.8-.4 1.7-.4 2.7 0 1 .1 2 .4 2.8.3.8.7 1.5 1.3 2 .5.6 1 1 1.8 1.3.7.3 1.4.5 2.2.5zM330.7 137.2v-5.3l1.4 1c.5.2 1.1.3 1.8.3a5 5 0 004-2c.5-.5.9-1.2 1.2-2l.3-2.7c0-1 0-2-.3-2.8a6 6 0 00-1-2c-.4-.6-.9-1-1.5-1.3-.5-.3-1.2-.5-1.9-.5-.9 0-1.7.2-2.3.6-.7.4-1.3.8-1.8 1.4l-.3-1.2c0-.2-.1-.3-.3-.4a1 1 0 00-.5-.1h-1.9v17h3.1zm2.5-6.4c-.5 0-1 0-1.3-.2a3 3 0 01-1.2-1V124c.4-.5.9-.8 1.3-1.1a3.2 3.2 0 012.7-.2l.9.7.5 1.3a8.6 8.6 0 010 3.8 4 4 0 01-.7 1.4c-.2.3-.5.6-1 .8a3 3 0 01-1.2.2zm11.7 2.4l1.2-.1a5 5 0 002-1c.4-.1.7-.4 1-.7l.3 1c0 .2.2.4.4.5l.6.1h1.4v-8.1a6 6 0 00-.3-2c-.2-.6-.5-1.1-1-1.6-.3-.4-.8-.7-1.4-1-.6-.2-1.3-.4-2-.4-2.1 0-3.9.7-5.4 2l.6 1 .4.4c.1.2.3.2.5.2.3 0 .5 0 .7-.2a31.8 31.8 0 001.6-.8l1.2-.1c.7 0 1.2.2 1.6.6.3.4.5 1 .5 1.9v.8c-1.4 0-2.7.1-3.6.4-1 .2-1.8.5-2.4 1-.6.3-1 .7-1.3 1.2a3 3 0 00-.4 1.4c0 .6.1 1.1.3 1.5a3 3 0 002 1.8l1.5.2zm1-2.1a2 2 0 
01-1.3-.4c-.3-.2-.5-.6-.5-1.1 0-.3 0-.6.2-.8l.8-.6a6 6 0 011.5-.4 17 17 0 012.2-.3v2.2l-.6.6a3.3 3.3 0 01-1.4.7h-1zm12 1.9v-8a4 4 0 011.2-1.5c.5-.4 1-.6 1.7-.6l.8.1.5.1h.4l.1-.4.4-2.3c-.4-.3-.9-.5-1.5-.5-.8 0-1.5.3-2.1.7a6 6 0 00-1.6 1.9l-.2-1.5c0-.3-.1-.5-.3-.7l-.6-.1h-1.8V133h3zm10.5.2a7 7 0 002.2-.3c.6-.2 1.2-.5 1.6-1 .4-.3.8-.7 1-1.3.2-.5.3-1 .3-1.7 0-.5 0-1-.2-1.3a3 3 0 00-.7-1 4 4 0 00-1-.6 12.5 12.5 0 00-2.4-.8 7 7 0 01-1-.4l-.7-.5a1 1 0 01-.2-.7c0-.4.1-.7.5-1 .3-.2.8-.4 1.4-.4a3.7 3.7 0 011.8.4 32.9 32.9 0 011 .4h.4l.3-.3.7-1.1c-.5-.5-1-.9-1.8-1.2-.7-.3-1.5-.4-2.4-.4-.8 0-1.5 0-2.1.3-.6.2-1.1.5-1.5.8a3.5 3.5 0 00-1.3 2.8c0 .6.1 1 .3 1.4.2.4.4.8.7 1l1 .7a10.8 10.8 0 002.4.9l1 .3.7.6c.2.1.3.4.3.7 0 .2 0 .4-.2.6 0 .2-.2.3-.3.5a2 2 0 01-.7.3 3 3 0 01-1 .1 3.5 3.5 0 01-1.9-.5 12 12 0 01-.6-.3 1 1 0 00-.5-.2 1 1 0 00-.5.1 1 1 0 00-.3.3l-.7 1.2.8.7a6.7 6.7 0 002.3.8l1.3.1zm13 0a10.1 10.1 0 002.9-.5 5.1 5.1 0 002.5-1.7l-1-1-.2-.3h-.3c-.2 0-.4 0-.6.2a22 22 0 01-1.7.7l-1.3.2c-1 0-1.8-.3-2.4-1-.6-.5-1-1.5-1-2.8h8.2l.2-.3.1-.3v-.6c0-1 0-1.8-.4-2.5-.2-.7-.6-1.3-1.1-1.8-.5-.5-1-.9-1.8-1.1-.6-.3-1.4-.4-2.2-.4-1 0-1.8.1-2.6.5a5.7 5.7 0 00-3.1 3.3c-.3.8-.5 1.6-.5 2.5 0 1.1.2 2.1.5 3 .4.8.8 1.6 1.4 2.1.5.6 1.2 1 2 1.3.7.3 1.6.5 2.5.5zm2.7-8h-5.8c.1-1 .5-1.7 1-2.3.5-.5 1.2-.7 2-.7.5 0 1 0 1.3.2l.8.6c.3.3.4.6.5 1l.2 1.1zm8.4 7.8v-8a4 4 0 011.1-1.5c.5-.4 1-.6 1.7-.6l.8.1.5.1h.4l.1-.4.4-2.3c-.4-.3-.9-.5-1.5-.5-.8 0-1.5.3-2.1.7a6 6 0 00-1.6 1.9l-.2-1.5c0-.3-.1-.5-.3-.7l-.6-.1h-1.8V133h3zM454 128.2l1-.1a4 4 0 001.6-.7l.8-.7.2.8c0 .2.2.3.3.4l.5.1h1.2v-6.5c0-.6-.1-1-.3-1.6-.2-.5-.4-.9-.8-1.2-.3-.4-.7-.7-1.1-.9-.5-.2-1-.3-1.6-.3-1.7 0-3.1.6-4.3 1.7l.4.8.3.3.4.1.6-.1a25.4 25.4 0 001.2-.7 3 3 0 011-.1c.6 0 1 .2 1.3.5.3.3.4.8.4 1.5v.6c-1.1 0-2.1.2-2.9.4-.8.2-1.4.4-1.9.7-.5.3-.8.7-1 1l-.3 1.2c0 .4 0 .8.2 1.2a2.4 2.4 0 001.6 1.4l1.2.2zm.8-1.7c-.5 0-.8-.1-1-.3-.3-.2-.4-.5-.4-1l.1-.6.7-.4 1.1-.4 1.8-.2v1.8a5 5 0 01-.5.4l-.5.4-.7.2h-.6zm10.6 1.7c.5 0 1-.1 1.4-.3.5-.1.9-.4 
1.2-.7l-.7-1.2-.2-.1h-.1-.2a3.5 3.5 0 01-.5.2h-.4c-.2 0-.4 0-.6-.2a1 1 0 01-.3-.8v-5.5h2.8v-1.8H465v-3.2h-1.2l-.4.1-.2.3-.5 2.8-1.6.3v1.4h1.4v5.8c0 1 .3 1.6.8 2.1s1.2.8 2 .8zm7.7 0c.5 0 1-.1 1.4-.3.5-.1 1-.4 1.3-.7l-.8-1.2-.1-.1h-.2-.2a3.5 3.5 0 01-.4.2h-.4c-.3 0-.5 0-.7-.2a1 1 0 01-.2-.8v-5.5h2.7v-1.8h-2.7v-3.2h-1.3l-.4.1-.1.3-.6 2.8-1.6.3v1l.1.4h1.4v5.8c0 1 .3 1.6.7 2.1.5.5 1.2.8 2 .8zm6.8-.2v-6.4c.3-.5.6-1 1-1.2a2 2 0 011.3-.5l.7.1h.7v-.3l.4-1.8c-.3-.2-.8-.4-1.3-.4-.6 0-1.1.2-1.6.6-.5.4-1 .9-1.3 1.5l-.2-1.2c0-.3 0-.4-.2-.5 0-.1-.2-.2-.5-.2h-1.4V128h2.4zm7.1-11.7l.6-.1a1.6 1.6 0 00.9-.9l.1-.5-.1-.7a1.6 1.6 0 00-1.5-1l-.6.2a1.6 1.6 0 00-.8.8 1.5 1.5 0 00.8 2c.2.2.4.2.6.2zm1.3 11.7v-10.3h-2.5V128h2.5zm7.8.2a4 4 0 003.2-1.6l1-1.7a8.2 8.2 0 000-4.4l-.8-1.6-1.2-1c-.5-.2-1-.3-1.6-.3a4 4 0 00-1.8.4c-.5.2-1 .6-1.3 1v-5.9H491V128h1.6l.4-.1.2-.4.1-.7.5.6a3 3 0 002.2.8zm-.6-2l-1-.1c-.4-.2-.7-.4-1-.8v-4.6l1-.9a2.5 2.5 0 012.3-.1l.6.5.4 1a6.9 6.9 0 010 3.1c-.1.5-.3.8-.5 1.1a2 2 0 01-.8.7l-1 .2zm10.2 2a4.2 4.2 0 003.2-1.5l.1.8c.1.3.4.5.7.5h1.5v-10.3h-2.5v7.5l-1 .7c-.4.2-.8.3-1.3.3s-1-.1-1.3-.5c-.2-.3-.4-.8-.4-1.4v-6.6h-2.5v6.6l.3 1.5c.1.5.3 1 .6 1.3l1.1.8c.4.2 1 .3 1.5.3zm11.5 0c.5 0 1-.1 1.4-.3.5-.1 1-.4 1.3-.7l-.8-1.2-.1-.1h-.2-.2a3.5 3.5 0 01-.4.2h-.4c-.3 0-.5 0-.7-.2a1 1 0 01-.2-.8v-5.5h2.7v-1.8h-2.7v-3.2h-1.3l-.4.1-.1.3-.6 2.8-1.6.3v1l.1.4h1.4v5.8c0 1 .3 1.6.7 2.1.5.5 1.2.8 2 .8zm8.7 0a8 8 0 002.3-.4l1-.5 1-.8-.8-1-.2-.1h-.2a1 1 0 00-.5 0 17.7 17.7 0 01-1.3.7l-1 .1c-.9 0-1.5-.2-2-.7-.5-.5-.8-1.3-.9-2.3h6.7l.1-.2.1-.3v-.5c0-.7 0-1.4-.3-2-.2-.5-.5-1-1-1.4a4 4 0 00-1.3-1 5 5 0 00-1.8-.2c-.8 0-1.5.1-2 .4a4.6 4.6 0 00-2.6 2.7c-.2.6-.3 1.3-.3 2 0 .8.1 1.6.4 2.3a5 5 0 001 1.7c.5.5 1 .9 1.6 1 .6.3 1.3.5 2 .5zm2.1-6.5h-4.6a3 3 0 01.8-1.7 2.3 2.3 0 012.6-.4c.3 0 .5.2.7.4l.4.8.1.9zm10.8 9.2v-1.6H531v1.6h8zM476.4 148v-6.4c.2-.5.5-1 1-1.2a2 2 0 011.2-.5l.7.1h.7l.1-.3.3-1.8c-.3-.2-.7-.3-1.2-.3-.6 0-1.2.1-1.7.5s-.9.9-1.3 1.5l-.1-1.2c0-.3-.1-.4-.2-.5l-.5-.2h-1.5V148h2.5zm9 .2a4.2 4.2 0 
003.1-1.5l.2.9c0 .3.3.4.6.4h1.5v-10.3h-2.5v7.5c-.3.3-.6.6-1 .7l-1.2.3c-.6 0-1-.1-1.3-.5-.3-.3-.5-.8-.5-1.4v-6.6H482v6.6c0 .5 0 1 .2 1.5.1.5.4 1 .7 1.3l1 .8 1.5.3zm10.7-.2v-14.9h-2.5V148h2.5zm7.1.2a8 8 0 002.3-.4l1-.5 1-.8-.8-1-.1-.1h-.3a1 1 0 00-.5 0 17.7 17.7 0 01-1.3.7l-1 .1c-.9 0-1.5-.2-2-.7-.5-.5-.8-1.3-.9-2.3h6.7l.1-.2.1-.3v-.5c0-.7 0-1.4-.3-2a4 4 0 00-2.3-2.4 5 5 0 00-1.8-.2c-.8 0-1.4.1-2 .4a4.6 4.6 0 00-2.6 2.7c-.2.6-.3 1.3-.3 2 0 .9.1 1.6.4 2.3a5 5 0 001 1.7c.5.5 1 .9 1.6 1 .7.3 1.3.5 2 .5zm2.1-6.5h-4.6a3 3 0 01.8-1.7c.4-.4 1-.7 1.7-.7l1 .2.6.5.4.8.1.9zm6.7 6.3v-6.4c.3-.5.6-1 1-1.2a2 2 0 011.3-.5l.7.1h.7v-.3l.4-1.8c-.3-.2-.8-.3-1.3-.3-.6 0-1.1.1-1.6.5s-1 .9-1.3 1.5l-.2-1.2c0-.3 0-.4-.2-.5 0-.1-.2-.2-.5-.2h-1.4V148h2.4zM576.6 135v-14.9h-2.5V135h2.5zm7.2.2a8 8 0 002.2-.4l1-.5 1-.8-.7-1-.2-.1h-.3a1 1 0 00-.4 0 17.7 17.7 0 01-1.4.7l-1 .1c-.8 0-1.5-.2-2-.7-.5-.5-.8-1.3-.8-2.3h6.6l.2-.2v-.3-.5c0-.7 0-1.4-.2-2l-1-1.4a4 4 0 00-1.4-1 5 5 0 00-1.8-.2c-.7 0-1.4.1-2 .4a4.6 4.6 0 00-2.5 2.7c-.3.6-.4 1.2-.4 2 0 .9.1 1.6.4 2.3a5 5 0 001 1.7c.5.5 1 .9 1.7 1 .6.3 1.3.5 2 .5zm2-6.5h-4.6a3 3 0 01.8-1.7c.4-.4 1-.7 1.7-.7l1 .2.6.5.4.8.2.9zm6.8 6.3v-7.5c.2-.3.5-.6.8-.7a2 2 0 011-.3c.6 0 1 .2 1.3.5.2.3.4.8.4 1.5v6.5h2.4v-6.5c0-.3 0-.6.2-.9 0-.2.2-.4.4-.6l.5-.4.7-.1c.6 0 1 .2 1.3.5.3.3.4.8.4 1.5v6.5h2.5v-6.5c0-.7 0-1.2-.2-1.7s-.4-.9-.7-1.2l-1.1-.8-1.5-.2a4.4 4.4 0 00-2 .4 3.3 3.3 0 00-1.3 1.5c-.2-.6-.5-1-1-1.4-.4-.3-.9-.5-1.5-.5a3.2 3.2 0 00-2.2.8l-.6.5-.2-.7c0-.3-.3-.5-.6-.5h-1.5V135h2.5zm16.8 0v-7.5c.3-.3.6-.6.9-.7a2 2 0 011-.3c.5 0 1 .2 1.2.5.3.3.4.8.4 1.5v6.5h2.5v-6.5l.1-.9.4-.6.6-.4.7-.1c.5 0 1 .2 1.2.5.3.3.5.8.5 1.5v6.5h2.4v-6.5c0-.7 0-1.2-.2-1.7-.1-.5-.4-.9-.7-1.2l-1-.8-1.6-.2a4.4 4.4 0 00-1.9.4 3.3 3.3 0 00-1.4 1.5c-.2-.6-.5-1-1-1.4-.3-.3-.9-.5-1.5-.5a3.2 3.2 0 00-2.2.8l-.6.5-.1-.7c-.1-.3-.3-.5-.7-.5H607V135h2.5zm16.8.2l1-.1a4 4 0 001.7-.7l.7-.7.3.8c0 .2.1.3.3.4l.5.1h1.1v-6.5c0-.6 0-1-.2-1.6-.2-.5-.5-.9-.8-1.2-.3-.4-.7-.7-1.2-.9-.5-.2-1-.3-1.6-.3-1.7 0-3 .6-4.3 
1.7l.5.8.3.3.4.1.6-.1a25.4 25.4 0 001.2-.7 3 3 0 011-.1c.6 0 1 .2 1.3.5.3.3.4.8.4 1.5v.6c-1.2 0-2.1.2-3 .4l-1.8.7c-.5.3-.8.7-1 1l-.3 1.2c0 .4 0 .8.2 1.2a2.4 2.4 0 001.5 1.4l1.2.2zm.8-1.7c-.4 0-.7-.1-1-.3-.2-.2-.4-.5-.4-1 0-.2 0-.4.2-.6l.6-.4 1.2-.4 1.8-.2v1.8a5 5 0 01-.5.4l-.6.4-.6.2h-.7zm10.6 1.7l1.5-.3c.4-.1.8-.3 1.2-.7l-.7-1.1-.2-.2h-.2-.1a3.5 3.5 0 01-.5.2h-.4c-.3 0-.5 0-.6-.2a1 1 0 01-.3-.8v-5.5h2.7v-1.8h-2.7v-3.2H636l-.3.1-.2.3-.5 2.8-1.7.3v1c0 .1 0 .3.2.4h1.3v5.8c0 1 .3 1.6.8 2.1.4.5 1.1.8 2 .8zm5.8-11.9l.6-.1a1.6 1.6 0 00.9-.9v-.5-.7a1.6 1.6 0 00-1.5-1l-.6.2a1.6 1.6 0 00-.8.8 1.5 1.5 0 00.8 2c.2.2.4.2.6.2zm1.2 11.7v-10.3h-2.5V135h2.5zm10 0v-1.9h-5l4.9-6.4a1.7 1.7 0 00.3-1v-1H647v2h5l-5 6.4a1.4 1.4 0 00-.2.9v1h7.9zm6.6.2a8 8 0 002.2-.4l1-.5 1-.8-.7-1-.2-.1h-.3a1 1 0 00-.5 0 17.7 17.7 0 01-1.3.7l-1 .1c-.8 0-1.5-.2-2-.7-.5-.5-.8-1.3-.8-2.3h6.6l.2-.2v-.3-.5c0-.7 0-1.4-.3-2a4 4 0 00-2.3-2.4 5 5 0 00-1.8-.2c-.7 0-1.4.1-2 .4a4.6 4.6 0 00-2.6 2.7c-.2.6-.3 1.2-.3 2 0 .9.1 1.6.4 2.3a5 5 0 001 1.7c.5.5 1 .9 1.7 1 .6.3 1.2.5 2 .5zm2-6.5h-4.6a3 3 0 01.8-1.7c.4-.4 1-.7 1.7-.7l1 .2.6.5.4.8.2.9zm6.8 6.3v-6.4c.2-.5.5-1 .9-1.2a2 2 0 011.3-.5l.7.1h.7l.1-.3.3-1.8c-.3-.2-.7-.4-1.2-.4-.6 0-1.2.2-1.7.6-.5.4-1 .9-1.3 1.5l-.1-1.2c0-.3-.1-.4-.2-.5l-.6-.2h-1.4V135h2.5z"/>
+    <path fill="#FFF" fill-rule="nonzero" d="M193.4 175v-2h-3c-.3 0-.6-.1-.8-.3-.2-.2-.3-.5-.3-.9V162h-5.4v2h3.1v7.8c0 1 .3 1.8.9 2.4.5.5 1.3.8 2.3.8h3.2zm6.3-11.4c.4 0 .7-.1 1-.3.2-.2.3-.5.3-.9 0-.3-.1-.6-.3-.8-.3-.3-.6-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.2.2-.3.5-.3.8 0 .4.1.7.3.9.3.2.6.3 1 .3h.4zm4 11.4v-2h-2.9V165h-5.3v2h3.1v5.8h-3.5v2.1h8.6zm6 .2c1.2 0 2.2-.3 2.9-.8.7-.5 1-1.3 1-2.3 0-.9-.3-1.6-.8-2-.6-.6-1.5-.9-2.6-1l-1.5-.2c-.9-.1-1.3-.5-1.3-1.1 0-.8.5-1.2 1.6-1.2h.7c.4 0 .7.1 1 .3.3.2.4.4.5.6h2.3c-.1-.8-.5-1.4-1.2-1.9-.6-.4-1.5-.7-2.6-.7h-.7c-1.3 0-2.2.3-2.9.8-.6.5-1 1.2-1 2.1 0 1 .3 1.6.9 2.1.5.5 1.3.8 2.5 1l1.4.1c1 .1 1.5.6 1.5 1.2 0 .4-.1.7-.5 1-.3.1-.7.3-1.3.3h-.7c-.4 0-.8-.1-1-.3-.4-.2-.6-.4-.7-.7H205c.2.8.6 1.5 1.2 2 .7.4 1.7.7 2.8.7h.7zm13.9-.2v-2H221a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2H220v-3h-2.3v3H215v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm6.3.2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3zm6.4 5.9v-6.2c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.3.5.4.3.5.8.5 1.4v6.2h2.3v-6.5c0-1-.3-2-.9-2.6a3 3 0 00-2.4-1 3 3 0 00-1.8.5c-.4.3-.7.8-.8 1.4h-.2v-1.7h-2.1v9.9h2.2zm12.2.2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3zm6.9 5.9v-6.2c0-.6.1-1 .5-1.4.3-.4.7-.6 1.3-.6.6 0 1.1.2 1.5.6.3.3.5.8.5 1.4v.6h2.3v-.8a4 4 0 00-.9-2.7c-.6-.7-1.4-1-2.5-1-.7 0-1.3.2-1.8.5s-.8.8-.9 1.4h-.2v-1.7h-2v9.9h2.2zM320.4 175v-2h-3c-.3 
0-.6-.1-.8-.3-.2-.2-.3-.5-.3-.9V162h-5.4v2h3.1v7.8c0 1 .3 1.8.9 2.4.5.5 1.3.8 2.3.8h3.2zm6.3-11.4c.4 0 .7-.1 1-.3.2-.2.3-.5.3-.9 0-.3-.1-.6-.3-.8-.3-.3-.6-.4-1-.4h-.4c-.4 0-.7.1-1 .4-.2.2-.3.5-.3.8 0 .4.1.7.3.9.3.2.6.3 1 .3h.4zm4 11.4v-2h-2.9V165h-5.3v2h3.1v5.8h-3.5v2.1h8.6zm6 .2c1.2 0 2.2-.3 2.9-.8.7-.5 1-1.3 1-2.3 0-.9-.3-1.6-.8-2-.6-.6-1.5-.9-2.6-1l-1.5-.2c-.9-.1-1.3-.5-1.3-1.1 0-.8.5-1.2 1.6-1.2h.7c.4 0 .7.1 1 .3.3.2.4.4.5.6h2.3c-.1-.8-.5-1.4-1.2-1.9-.6-.4-1.5-.7-2.6-.7h-.7c-1.3 0-2.2.3-2.9.8-.6.5-1 1.2-1 2.1 0 1 .3 1.6.9 2.1.5.5 1.3.8 2.5 1l1.4.1c1 .1 1.5.6 1.5 1.2 0 .4-.1.7-.5 1-.3.1-.7.3-1.3.3h-.7c-.4 0-.8-.1-1-.3-.4-.2-.6-.4-.7-.7H332c.2.8.6 1.5 1.2 2 .7.4 1.7.7 2.8.7h.7zm13.9-.2v-2H348a1 1 0 01-.8-.3 1 1 0 01-.3-.8v-4.8h3.8v-2H347v-3h-2.3v3H342v2h2.7v4.8c0 1 .3 1.7.9 2.3.6.5 1.3.8 2.3.8h2.7zm6.3.2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3zm6.4 5.9v-6.2c0-.6.2-1 .5-1.4.4-.4.8-.5 1.4-.5.6 0 1 .1 1.3.5.4.3.5.8.5 1.4v6.2h2.3v-6.5c0-1-.3-2-.9-2.6a3 3 0 00-2.4-1 3 3 0 00-1.8.5c-.4.3-.7.8-.8 1.4h-.2v-1.7h-2.1v9.9h2.2zm12.2.2c.5 0 1 0 1.5-.2l1.2-.5 1-.9.5-1.1h-2.2c-.2.3-.4.5-.8.6a3 3 0 01-1.2.3 2 2 0 01-1.5-.6c-.4-.4-.6-.9-.6-1.5v-.7h6.4v-1.8a4 4 0 00-.3-1.6c-.2-.5-.5-.9-.9-1.2a4 4 0 00-1.3-.8l-1.8-.3c-.7 0-1.3.1-1.8.3-.5.2-1 .4-1.3.8-.4.3-.7.7-.9 1.2a4 4 0 00-.3 1.6v2.5c0 .6.1 1.1.3 1.6a3.5 3.5 0 002.2 2l1.8.3zm2-6.1h-4v-.3c0-.7.1-1.2.5-1.5a2 2 0 011.5-.6 2 2 0 011.5.6c.4.3.6.8.6 1.5v.3zm6.9 5.9v-6.2c0-.6.1-1 .5-1.4.3-.4.7-.6 1.3-.6.6 0 1.1.2 1.5.6.3.3.5.8.5 1.4v.6h2.3v-.8a4 4 0 00-.9-2.7c-.6-.7-1.4-1-2.5-1-.7 0-1.3.2-1.8.5s-.8.8-.9 1.4h-.2v-1.7h-2v9.9h2.2z"/>
+    <path fill="#CCBDFE" stroke="#5E3AD7" stroke-linejoin="round" stroke-width="3.8" d="M672 90h123.5l21.5 40.9-21.5 39.1H672l21.5-39z"/>
+    <path fill="#B39DFF" stroke="#5E3AD7" stroke-linejoin="round" stroke-width="3.8" d="M817 131l-21.5 39H672l21.5-38.9z"/>
+    <path fill="#3D4251" fill-rule="nonzero" d="M727.8 121v-9.3l1.4-1c.4-.2 1-.3 1.5-.3.7 0 1.2.2 1.6.6.4.4.5 1 .5 1.8v8.2h3.1v-8.2c0-.7 0-1.3-.3-2-.1-.5-.4-1-.8-1.5-.3-.4-.8-.7-1.3-1-.6-.2-1.2-.3-1.9-.3a5.4 5.4 0 00-2.2.5 5.7 5.7 0 00-1.8 1.3l-.2-1c0-.4-.4-.6-.8-.6h-1.8V121h3zm16.6.2a10.1 10.1 0 002.8-.5 5.1 5.1 0 002.5-1.7l-.9-1-.2-.3h-.4l-.6.1a22 22 0 01-1.6.8l-1.3.2c-1 0-1.8-.3-2.5-1-.6-.5-1-1.5-1-2.8h8.3l.2-.3v-.3l.1-.6c0-1-.1-1.8-.4-2.5-.3-.7-.7-1.3-1.1-1.8-.5-.5-1.1-.9-1.8-1.1-.7-.3-1.4-.4-2.2-.4-1 0-1.8.1-2.6.5a5.7 5.7 0 00-3.2 3.3c-.3.8-.4 1.6-.4 2.5 0 1.1.2 2.1.5 3a5.6 5.6 0 003.3 3.5c.8.2 1.6.4 2.5.4zm2.6-8h-5.8c.2-1 .5-1.7 1-2.3.5-.5 1.2-.7 2.1-.7.5 0 .9 0 1.2.2l.9.6.5 1 .1 1.1zm8.4 7.8v-8a4 4 0 011.2-1.5c.4-.4 1-.6 1.6-.6l.9.1.5.1h.3l.2-.4.4-2.3c-.4-.3-1-.5-1.6-.5-.7 0-1.4.3-2 .7a6 6 0 00-1.7 1.9l-.1-1.5c0-.3-.2-.5-.3-.7l-.7-.1h-1.8V121h3.1zM716 157.1c.5 0 1 0 1.3-.2.5-.1.8-.3 1.2-.6l-.7-1-.1-.2h-.2-.2a3.2 3.2 0 01-.4.2h-.3c-.3 0-.5 0-.6-.2a1 1 0 01-.2-.7v-5h2.4v-1.6h-2.4V145h-1.2l-.3.1-.2.3-.5 2.5-1.4.3v1.2l.4.1h1v5.2c0 .8.2 1.4.6 1.9.4.4 1 .6 1.8.6zm8 0a5 5 0 001.8-.3 4 4 0 002.3-2.5c.3-.5.4-1.2.4-2 0-.7-.1-1.3-.4-2a4 4 0 00-2.4-2.4 5 5 0 00-1.8-.3c-.7 0-1.3.1-1.9.4a4 4 0 00-2.3 2.4c-.3.6-.4 1.2-.4 2 0 .7.1 1.4.4 2a4 4 0 002.4 2.5l1.8.2zm0-1.7c-.9 0-1.4-.2-1.8-.8-.4-.5-.6-1.2-.6-2.2 0-1 .2-1.8.6-2.3.4-.5 1-.8 1.7-.8.8 0 1.3.3 1.7.8.4.5.6 1.3.6 2.3 0 1-.2 1.7-.6 2.3-.4.5-1 .7-1.7.7zm8.4 1.6v-4.2h1l.2.3 2.4 3.5.3.3.4.1h2l-3-4.5-.3-.4-.4-.3c.2 0 .3-.2.4-.3l.3-.3 2.9-3.4H736l-.3.4-2.3 2.8a1 1 0 01-.3.2h-.8v-7.6h-2.2V157h2.2zm16.2 0v-1.3c0-.3 0-.4-.2-.6a.8.8 0 00-.6-.2h-3.3a7.1 7.1 0 00-1.6.3l2.9-3a52 52 0 001.8-2.1l.6-1.2a4 4 0 000-2.9 3.4 3.4 0 00-2.2-1.9c-.5-.2-1-.3-1.6-.3-.6 0-1.2.1-1.7.3a4 4 0 00-2.3 1.9l-.5 1.6 1.2.2h.3l.5-.1.3-.5c.1-.4.4-.8.7-1a2 2 0 011.3-.5l.8.1a1.6 1.6 0 011 1l.1.8-.1 1-.4 1-.7.8-.9 1-4 4a1.3 1.3 0 00-.3.9v.7h9zm6.6 0l3.7-9.2h-2.2l-.2.4-1.8 5a7.8 7.8 0 00-.5 1.6 16.4 16.4 0 00-.4-1.7l-1.8-5-.2-.2a.7.7 0 00-.5-.1h-1.8l3.7 
9.2h2zm8.9.1a7.3 7.3 0 002-.3l1-.5c.3-.1.6-.4.8-.7l-.6-.8-.2-.1h-.3-.4a15.9 15.9 0 01-1.2.6 3 3 0 01-1 .1c-.6 0-1.2-.2-1.7-.7-.4-.4-.7-1-.7-2h5.9l.2-.2v-.3-.4a5 5 0 00-.2-1.8l-.8-1.3-1.3-.8a4.6 4.6 0 00-3.5 0 4.1 4.1 0 00-2.2 2.5c-.3.6-.4 1.2-.4 1.8 0 .8.2 1.5.4 2.1.2.6.6 1.2 1 1.6l1.4 1a5 5 0 001.8.2zm1.9-5.8h-4.2c.1-.7.3-1.2.7-1.5a2 2 0 011.5-.6c.4 0 .6 0 .9.2.2 0 .4.2.6.4l.4.7v.8zm7.3 5.8a6.8 6.8 0 002-.3 4.3 4.3 0 001.7-1.2l-.7-.8-.4-.2-.4.1-.4.3-.5.3-.9.1c-.3 0-.6 0-1-.2a2 2 0 01-.6-.6c-.2-.2-.4-.5-.5-1a5 5 0 01-.2-1.2c0-.5 0-1 .2-1.3 0-.4.2-.7.4-1l.8-.6 1-.2a2.5 2.5 0 011.3.4l.4.2h.6l.2-.2.6-.8-1.4-1-1.8-.3c-.8 0-1.4.1-2 .4a4 4 0 00-2.2 2.5c-.2.6-.3 1.2-.3 1.9s0 1.4.3 2c.2.6.5 1 .9 1.5l1.3 1 1.6.2z"/>
+    <g>
+      <path fill="url(#c)" fill-rule="nonzero" d="M249.5 40.5v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zM240-2h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zM6 0v1a2 2 0 104 0V0a2 2 0 10-4 0zm0 8v1a2 2 0 104 0V8a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0z" transform="translate(226 28.5)"/>
+      <path fill="#CDB217" d="M234 73l-8-16h16z"/>
+    </g>
+    <g>
+      <path fill="url(#d)" fill-rule="nonzero" d="M129.5 40.5v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zm0-8v-1a2 2 0 00-4 0v1a2 2 0 004 0zM120-2h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 000 4h1a2 2 0 000-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zm-8 0h-1a2 2 0 100 4h1a2 2 0 100-4zM6 0v1a2 2 0 104 0V0a2 2 0 10-4 0zm0 8v1a2 2 0 104 0V8a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0zm0 8v1a2 2 0 104 0v-1a2 2 0 10-4 0z" transform="translate(485 28.5)"/>
+      <path fill="#3AD787" d="M493 73l-8-16h16z"/>
+    </g>
+  </g>
+</svg>
diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index d37e9471d..69ef5fcf2 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -55,15 +55,15 @@ For a detailed compatibility overview, see the
 This is also the source of spaCy's internal compatibility check, performed when
 you run the [`download`](/api/cli#download) command.
 
-## Pretrained pipeline design {#design}
+## Trained pipeline design {#design}
 
-The spaCy v3 pretrained pipelines are designed to be efficient and configurable.
+The spaCy v3 trained pipelines are designed to be efficient and configurable.
 For example, multiple components can share a common "token-to-vector" model and
 it's easy to swap out or disable the lemmatizer. The pipelines are designed to
 be efficient in terms of speed and size and work well when the pipeline is run
 in full.
 
-When modifying a pretrained pipeline, it's important to understand how the
+When modifying a trained pipeline, it's important to understand how the
 components **depend on** each other. Unlike spaCy v2, where the `tagger`,
 `parser` and `ner` components were all independent, some v3 components depend on
 earlier components in the pipeline. As a result, disabling or reordering
@@ -84,6 +84,8 @@ Main changes from spaCy v2 models:
 
 ### CNN/CPU pipeline design
 
+![Components and their dependencies in the CNN pipelines](../images/pipeline-design.svg)
+
 In the `sm`/`md`/`lg` models:
 
 - The `tagger`, `morphologizer` and `parser` components listen to the `tok2vec`
@@ -99,11 +101,9 @@ In the `sm`/`md`/`lg` models:
   `tagger`+`attribute_ruler` or `morphologizer`.
 - The `ner` component is independent with its own internal tok2vec layer.
 
-<!-- TODO: pretty diagram -->
-
 ### Transformer pipeline design
 
-In the tranformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
+In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
 all listen to the `transformer` component. The `attribute_ruler` and
 `lemmatizer` have the same configuration as in the CNN models.
 
@@ -112,7 +112,7 @@ all listen to the `transformer` component. The `attribute_ruler` and
 ### Modifying the default pipeline
 
 For faster processing, you may only want to run a subset of the components in a
-pretrained pipeline. The `disable` and `exclude` arguments to
+trained pipeline. The `disable` and `exclude` arguments to
 [`spacy.load`](/api/top-level#spacy.load) let you control which components are
 loaded and run. Disabled components are loaded in the background so it's
 possible to reenable them in the same pipeline in the future with

From de4f4c9b8a46395b5f2be5fa80afd746c4a7f3cb Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 6 Apr 2021 14:15:21 +1000
Subject: [PATCH 077/146] Add more link anchors [ci skip]

---
 website/docs/models/index.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 69ef5fcf2..65f444cd8 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -82,7 +82,7 @@ Main changes from spaCy v2 models:
 - The lemmatizer tables and processing move from the vocab and tagger to a
   separate `lemmatizer` component.
 
-### CNN/CPU pipeline design
+### CNN/CPU pipeline design {#design-cnn}
 
 ![Components and their dependencies in the CNN pipelines](../images/pipeline-design.svg)
 
@@ -101,7 +101,7 @@ In the `sm`/`md`/`lg` models:
   `tagger`+`attribute_ruler` or `morphologizer`.
 - The `ner` component is independent with its own internal tok2vec layer.
 
-### Transformer pipeline design
+### Transformer pipeline design {#design-trf}
 
 In the transformer (`trf`) models, the `tagger`, `parser` and `ner` (if present)
 all listen to the `transformer` component. The `attribute_ruler` and
@@ -109,7 +109,7 @@ all listen to the `transformer` component. The `attribute_ruler` and
 
 <!-- TODO: pretty diagram -->
 
-### Modifying the default pipeline
+### Modifying the default pipeline {#design-modify}
 
 For faster processing, you may only want to run a subset of the components in a
 trained pipeline. The `disable` and `exclude` arguments to

From 81fd595223ced0df15318cca692fdf0d8e8f79fd Mon Sep 17 00:00:00 2001
From: graue70 <23035329+graue70@users.noreply.github.com>
Date: Thu, 8 Apr 2021 09:34:14 +0200
Subject: [PATCH 078/146] Fix __add__ method of PRFScore (#7557)

* Add failing test for PRFScore

* Fix erroneous implementation of __add__

* Simplify constructor

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 spacy/scorer.py            | 14 ++++++++++----
 spacy/tests/test_scorer.py | 22 +++++++++++++++++++++-
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index f28cb5639..8061aa329 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -20,10 +20,16 @@ MISSING_VALUES = frozenset([None, 0, ""])
 class PRFScore:
     """A precision / recall / F score."""
 
-    def __init__(self) -> None:
-        self.tp = 0
-        self.fp = 0
-        self.fn = 0
+    def __init__(
+        self,
+        *,
+        tp: int = 0,
+        fp: int = 0,
+        fn: int = 0,
+    ) -> None:
+        self.tp = tp
+        self.fp = fp
+        self.fn = fn
 
     def __len__(self) -> int:
         return self.tp + self.fp + self.fn
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 4dddca404..ecdaee768 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -3,7 +3,7 @@ import pytest
 from pytest import approx
 from spacy.training import Example
 from spacy.training.iob_utils import offsets_to_biluo_tags
-from spacy.scorer import Scorer, ROCAUCScore
+from spacy.scorer import Scorer, ROCAUCScore, PRFScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from spacy.lang.en import English
 from spacy.tokens import Doc
@@ -403,3 +403,23 @@ def test_roc_auc_score():
     score.score_set(0.75, 1)
     with pytest.raises(ValueError):
         _ = score.score  # noqa: F841
+
+
+def test_prf_score():
+    cand = {"hi", "ho"}
+    gold1 = {"yo", "hi"}
+    gold2 = set()
+
+    a = PRFScore()
+    a.score_set(cand=cand, gold=gold1)
+    assert (a.precision, a.recall, a.fscore) == approx((0.5, 0.5, 0.5))
+
+    b = PRFScore()
+    b.score_set(cand=cand, gold=gold2)
+    assert (b.precision, b.recall, b.fscore) == approx((0.0, 0.0, 0.0))
+
+    c = a + b
+    assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))
+
+    a += b
+    assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore))

From ff84075839b11cb5a6df10b3931cd6e1ff199171 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 8 Apr 2021 10:08:04 +0200
Subject: [PATCH 079/146] Support large/infinite training corpora (#7208)

* Support infinite generators for training corpora

Support a training corpus with an infinite generator in the `spacy
train` training loop:

* Revert `create_train_batches` to the state where an infinite generator
can be used as the train corpus in the first epoch of exactly one epoch without
resulting in a memory leak (`max_epochs != 1` will still result in a
memory leak)
* Move the shuffling for the first epoch into the corpus reader,
renaming it to `spacy.Corpus.v2`.

* Switch to training option for shuffling in memory

Training loop:

* Add option `training.shuffle_train_corpus_in_memory` that controls
whether the corpus is loaded in memory once and shuffled in the training
loop
  * Revert changes to `create_train_batches` and rename to
`create_train_batches_with_shuffling` for use with `spacy.Corpus.v1` and
a corpus that should be loaded in memory
  * Add `create_train_batches_without_shuffling` for a corpus that
should not be shuffled in the training loop: the corpus is merely
batched during training

Corpus readers:

* Restore `spacy.Corpus.v1`
* Add `spacy.ShuffledCorpus.v1` for a corpus shuffled in memory in the
reader instead of the training loop
  * In combination with `shuffle_train_corpus_in_memory = False`, each
epoch could result in a different augmentation

* Refactor create_train_batches, validation

* Rename config setting to `training.shuffle_train_corpus`
* Refactor to use a single `create_train_batches` method with a
`shuffle` option
* Only validate `get_examples` in initialize step if:
  * labels are required
  * labels are not provided

* Switch back to max_epochs=-1 for streaming train corpus

* Use first 100 examples for stream train corpus init

* Always check validate_get_examples in initialize
---
 spacy/default_config.cfg     |  3 +++
 spacy/training/corpus.py     |  8 ++++++++
 spacy/training/initialize.py |  7 ++++++-
 spacy/training/loop.py       | 19 ++++++++++++-------
 4 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index 42081f410..7f092d5f5 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -70,6 +70,9 @@ dropout = 0.1
 accumulate_gradient = 1
 # Controls early-stopping. 0 disables early stopping.
 patience = 1600
+# Number of epochs. 0 means unlimited. If >= 0, train corpus is loaded once in
+# memory and shuffled within the training loop. -1 means stream train corpus
+# rather than loading in memory with no shuffling within the training loop.
 max_epochs = 0
 max_steps = 20000
 eval_frequency = 200
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index 079b872d6..063d80a95 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -2,6 +2,7 @@ import warnings
 from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
 from typing import Optional
 from pathlib import Path
+import random
 import srsly
 
 from .. import util
@@ -96,6 +97,7 @@ class Corpus:
         Defaults to 0, which indicates no limit.
     augment (Callable[Example, Iterable[Example]]): Optional data augmentation
         function, to extrapolate additional examples from your annotations.
+    shuffle (bool): Whether to shuffle the examples.
 
     DOCS: https://spacy.io/api/corpus
     """
@@ -108,12 +110,14 @@ class Corpus:
         gold_preproc: bool = False,
         max_length: int = 0,
         augmenter: Optional[Callable] = None,
+        shuffle: bool = False,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc
         self.max_length = max_length
         self.limit = limit
         self.augmenter = augmenter if augmenter is not None else dont_augment
+        self.shuffle = shuffle
 
     def __call__(self, nlp: "Language") -> Iterator[Example]:
         """Yield examples from the data.
@@ -124,6 +128,10 @@ class Corpus:
         DOCS: https://spacy.io/api/corpus#call
         """
         ref_docs = self.read_docbin(nlp.vocab, walk_corpus(self.path, FILE_TYPE))
+        if self.shuffle:
+            ref_docs = list(ref_docs)
+            random.shuffle(ref_docs)
+
         if self.gold_preproc:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
         else:
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 69861a9a9..36384d67b 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -8,6 +8,7 @@ import tarfile
 import gzip
 import zipfile
 import tqdm
+from itertools import islice
 
 from .pretrain import get_tok2vec_ref
 from ..lookups import Lookups
@@ -68,7 +69,11 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     # Make sure that listeners are defined before initializing further
     nlp._link_components()
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
-        nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
+        if T["max_epochs"] == -1:
+            logger.debug("Due to streamed train corpus, using only first 100 examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labels")
+            nlp.initialize(lambda: islice(train_corpus(nlp), 100), sgd=optimizer)
+        else:
+            nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index a1242aea6..ecfa12fdb 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -78,7 +78,7 @@ def train(
     training_step_iterator = train_while_improving(
         nlp,
         optimizer,
-        create_train_batches(train_corpus(nlp), batcher, T["max_epochs"]),
+        create_train_batches(nlp, train_corpus, batcher, T["max_epochs"]),
         create_evaluation_callback(nlp, dev_corpus, score_weights),
         dropout=T["dropout"],
         accumulate_gradient=T["accumulate_gradient"],
@@ -290,17 +290,22 @@ def create_evaluation_callback(
 
 
 def create_train_batches(
-    iterator: Iterator[Example],
+    nlp: "Language",
+    corpus: Callable[["Language"], Iterable[Example]],
     batcher: Callable[[Iterable[Example]], Iterable[Example]],
     max_epochs: int,
 ):
     epoch = 0
-    examples = list(iterator)
-    if not examples:
-        # Raise error if no data
-        raise ValueError(Errors.E986)
+    if max_epochs >= 0:
+        examples = list(corpus(nlp))
+        if not examples:
+            # Raise error if no data
+            raise ValueError(Errors.E986)
     while max_epochs < 1 or epoch != max_epochs:
-        random.shuffle(examples)
+        if max_epochs >= 0:
+            random.shuffle(examples)
+        else:
+            examples = corpus(nlp)
         for batch in batcher(examples):
             yield epoch, batch
         epoch += 1

From ee159b8543c8d882dfa9dfad5f946269b6ff2a2c Mon Sep 17 00:00:00 2001
From: broaddeep <43122784+broaddeep@users.noreply.github.com>
Date: Thu, 8 Apr 2021 17:10:14 +0900
Subject: [PATCH 080/146] Support match alignments (#7321)

* Support match alignments

* change naming from match_alignments to with_alignments, add conditional flow if with_alignments is given, validate with_alignments, add related test case

* remove added errors, utilize bint type, cleanup whitespace

* fix no new line in end of file

* Minor formatting

* Skip alignments processing if as_spans is set

* Add with_alignments to Matcher API docs

* Update website/docs/api/matcher.md

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
---
 .github/contributors/broaddeep.md         | 106 +++++++++++++++++
 spacy/matcher/matcher.pxd                 |   6 +
 spacy/matcher/matcher.pyx                 | 135 ++++++++++++++++++----
 spacy/tests/matcher/test_matcher_logic.py |  87 ++++++++++++++
 website/docs/api/matcher.md               |  15 +--
 5 files changed, 321 insertions(+), 28 deletions(-)
 create mode 100644 .github/contributors/broaddeep.md

diff --git a/.github/contributors/broaddeep.md b/.github/contributors/broaddeep.md
new file mode 100644
index 000000000..d6c4b3cf3
--- /dev/null
+++ b/.github/contributors/broaddeep.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Dongjun Park         |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 2021-03-06           |
+| GitHub username                | broaddeep            |
+| Website (optional)             |                      |
diff --git a/spacy/matcher/matcher.pxd b/spacy/matcher/matcher.pxd
index 52a30d94c..455f978cc 100644
--- a/spacy/matcher/matcher.pxd
+++ b/spacy/matcher/matcher.pxd
@@ -46,6 +46,12 @@ cdef struct TokenPatternC:
     int32_t nr_py
     quantifier_t quantifier
     hash_t key
+    int32_t token_idx
+
+
+cdef struct MatchAlignmentC:
+    int32_t token_idx
+    int32_t length
 
 
 cdef struct PatternStateC:
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 26dca05eb..dae12c3f6 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -196,7 +196,7 @@ cdef class Matcher:
                 else:
                     yield doc
 
-    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False, with_alignments=False):
         """Find all token sequences matching the supplied pattern.
 
         doclike (Doc or Span): The document to match over.
@@ -204,10 +204,16 @@ cdef class Matcher:
             start, end) tuples.
         allow_missing (bool): Whether to skip checks for missing annotation for
             attributes included in patterns. Defaults to False.
+        with_alignments (bool): Return match alignment information, which is a
+            `List[int]` with the same length as the matched span. Each entry is
+            the index of the token pattern that matched the corresponding
+            token. If as_spans is set to True, this setting is ignored.
         RETURNS (list): A list of `(match_id, start, end)` tuples,
             describing the matches. A match tuple describes a span
             `doc[start:end]`. The `match_id` is an integer. If as_spans is set
             to True, a list of Span objects is returned.
+            If with_alignments is set to True and as_spans is set to False,
+            a list of `(match_id, start, end, alignments)` tuples is returned.
         """
         if isinstance(doclike, Doc):
             doc = doclike
@@ -217,6 +223,9 @@ cdef class Matcher:
             length = doclike.end - doclike.start
         else:
             raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
+        # Skip alignments calculations if as_spans is set
+        if as_spans:
+            with_alignments = False
         cdef Pool tmp_pool = Pool()
         if not allow_missing:
             for attr in (TAG, POS, MORPH, LEMMA, DEP):
@@ -232,18 +241,20 @@ cdef class Matcher:
                     error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
                     raise ValueError(error_msg)
         matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
-                                extensions=self._extensions, predicates=self._extra_predicates)
+                                extensions=self._extensions, predicates=self._extra_predicates, with_alignments=with_alignments)
         final_matches = []
         pairs_by_id = {}
-        # For each key, either add all matches, or only the filtered, non-overlapping ones
-        for (key, start, end) in matches:
+        # For each key, either add all matches, or only the filtered,
+        # non-overlapping ones. Each `match` is either (start, end) or
+        # (start, end, alignments), depending on the `with_alignments` option.
+        for key, *match in matches:
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])
-                pairs.append((start,end))
+                pairs.append(match)
                 pairs_by_id[key] = pairs
             else:
-                final_matches.append((key, start, end))
+                final_matches.append((key, *match))
         matched = <char*>tmp_pool.alloc(length, sizeof(char))
         empty = <char*>tmp_pool.alloc(length, sizeof(char))
         for key, pairs in pairs_by_id.items():
@@ -255,14 +266,18 @@ cdef class Matcher:
                 sorted_pairs = sorted(pairs, key=lambda x: (x[1]-x[0], -x[0]), reverse=True) # reverse sort by length
             else:
                 raise ValueError(Errors.E947.format(expected=["FIRST", "LONGEST"], arg=span_filter))
-            for (start, end) in sorted_pairs:
+            for match in sorted_pairs:
+                start, end = match[:2]
                 assert 0 <= start < end  # Defend against segfaults
                 span_len = end-start
                 # If no tokens in the span have matched
                 if memcmp(&matched[start], &empty[start], span_len * sizeof(matched[0])) == 0:
-                    final_matches.append((key, start, end))
+                    final_matches.append((key, *match))
                     # Mark tokens that have matched
                     memset(&matched[start], 1, span_len * sizeof(matched[0]))
+        if with_alignments:
+            final_matches_with_alignments = final_matches
+            final_matches = [(key, start, end) for key, start, end, alignments in final_matches]
         # perform the callbacks on the filtered set of results
         for i, (key, start, end) in enumerate(final_matches):
             on_match = self._callbacks.get(key, None)
@@ -270,6 +285,22 @@ cdef class Matcher:
                 on_match(self, doc, i, final_matches)
         if as_spans:
             return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+        elif with_alignments:
+            # convert alignments List[Dict[str, int]] --> List[int]
+            final_matches = []
+            # When multiple alignments are found for the same length,
+            # keep the alignment that has the largest token_idx.
+            for key, start, end, alignments in final_matches_with_alignments:
+                sorted_alignments = sorted(alignments, key=lambda x: (x['length'], x['token_idx']), reverse=False)
+                alignments = [0] * (end-start)
+                for align in sorted_alignments:
+                    if align['length'] >= end-start:
+                        continue
+                    # Since alignments are sorted by (length, token_idx), a later
+                    # entry overwrites a smaller token_idx with the same length.
+                    alignments[align['length']] = align['token_idx']
+                final_matches.append((key, start, end, alignments))
+            return final_matches
         else:
             return final_matches
 
@@ -288,9 +319,9 @@ def unpickle_matcher(vocab, patterns, callbacks):
     return matcher
 
 
-cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple()):
+cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, extensions=None, predicates=tuple(), bint with_alignments=0):
     """Find matches in a doc, with a compiled array of patterns. Matches are
-    returned as a list of (id, start, end) tuples.
+    returned as a list of (id, start, end) tuples, or (id, start, end, alignments) tuples if with_alignments != 0.
 
     To augment the compiled patterns, we optionally also take two Python lists.
 
@@ -302,6 +333,8 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     """
     cdef vector[PatternStateC] states
     cdef vector[MatchC] matches
+    cdef vector[vector[MatchAlignmentC]] align_states
+    cdef vector[vector[MatchAlignmentC]] align_matches
     cdef PatternStateC state
     cdef int i, j, nr_extra_attr
     cdef Pool mem = Pool()
@@ -328,12 +361,14 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
     for i in range(length):
         for j in range(n):
             states.push_back(PatternStateC(patterns[j], i, 0))
-        transition_states(states, matches, predicate_cache,
-            doclike[i], extra_attr_values, predicates)
+        if with_alignments != 0:
+            align_states.resize(states.size())
+        transition_states(states, matches, align_states, align_matches, predicate_cache,
+            doclike[i], extra_attr_values, predicates, with_alignments)
         extra_attr_values += nr_extra_attr
         predicate_cache += len(predicates)
     # Handle matches that end in 0-width patterns
-    finish_states(matches, states)
+    finish_states(matches, states, align_matches, align_states, with_alignments)
     seen = set()
     for i in range(matches.size()):
         match = (
@@ -346,16 +381,22 @@ cdef find_matches(TokenPatternC** patterns, int n, object doclike, int length, e
         # first .?, or the second .? -- it doesn't matter, it's just one match.
         # Skip 0-length matches. (TODO: fix algorithm)
         if match not in seen and matches[i].length > 0:
-            output.append(match)
+            if with_alignments != 0:
+                # `align_matches` corresponds 1:1 to `matches`, so the same index 'i' can be used for both
+                output.append(match + (align_matches[i],))
+            else:
+                output.append(match)
             seen.add(match)
     return output
 
 
 cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
+                            vector[vector[MatchAlignmentC]]& align_states, vector[vector[MatchAlignmentC]]& align_matches,
                             int8_t* cached_py_predicates,
-        Token token, const attr_t* extra_attrs, py_predicates) except *:
+        Token token, const attr_t* extra_attrs, py_predicates, bint with_alignments) except *:
     cdef int q = 0
     cdef vector[PatternStateC] new_states
+    cdef vector[vector[MatchAlignmentC]] align_new_states
     cdef int nr_predicate = len(py_predicates)
     for i in range(states.size()):
         if states[i].pattern.nr_py >= 1:
@@ -370,23 +411,39 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
         # it in the states list, because q doesn't advance.
         state = states[i]
         states[q] = state
+        # Alignments are kept separate from `states` so that matching without alignments is not slowed down.
+        # `align_states` always corresponds to `states` 1:1.
+        if with_alignments != 0:
+            align_state = align_states[i]
+            align_states[q] = align_state
         while action in (RETRY, RETRY_ADVANCE, RETRY_EXTEND):
+            # Update alignment before the transition of current state
+            # 'MatchAlignmentC' maps 'original token index of current pattern' to 'current matching length'
+            if with_alignments != 0:
+                align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
             if action == RETRY_EXTEND:
                 # This handles the 'extend'
                 new_states.push_back(
                     PatternStateC(pattern=states[q].pattern, start=state.start,
                                   length=state.length+1))
+                if with_alignments != 0:
+                    align_new_states.push_back(align_states[q])
             if action == RETRY_ADVANCE:
                 # This handles the 'advance'
                 new_states.push_back(
                     PatternStateC(pattern=states[q].pattern+1, start=state.start,
                                   length=state.length+1))
+                if with_alignments != 0:
+                    align_new_states.push_back(align_states[q])
             states[q].pattern += 1
             if states[q].pattern.nr_py != 0:
                 update_predicate_cache(cached_py_predicates,
                     states[q].pattern, token, py_predicates)
             action = get_action(states[q], token.c, extra_attrs,
                                 cached_py_predicates)
+        # Update alignment before the transition of current state
+        if with_alignments != 0:
+            align_states[q].push_back(MatchAlignmentC(states[q].pattern.token_idx, states[q].length))
         if action == REJECT:
             pass
         elif action == ADVANCE:
@@ -399,29 +456,50 @@ cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& match
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
                             length=state.length+1))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
             elif action == MATCH_DOUBLE:
                 # push match without last token if length > 0
                 if state.length > 0:
                     matches.push_back(
                         MatchC(pattern_id=ent_id, start=state.start,
                                 length=state.length))
+                    # MATCH_DOUBLE emits matches twice,
+                    # add one more to align_matches in order to keep 1:1 relationship
+                    if with_alignments != 0:
+                        align_matches.push_back(align_states[q])
                 # push match with last token
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
                             length=state.length+1))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
             elif action == MATCH_REJECT:
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
                             length=state.length))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
             elif action == MATCH_EXTEND:
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start,
                            length=state.length))
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_states[q])
                 states[q].length += 1
                 q += 1
     states.resize(q)
     for i in range(new_states.size()):
         states.push_back(new_states[i])
+    # `align_states` always corresponds to `states` 1:1
+    if with_alignments != 0:
+        align_states.resize(q)
+        for i in range(align_new_states.size()):
+            align_states.push_back(align_new_states[i])
 
 
 cdef int update_predicate_cache(int8_t* cache,
@@ -444,15 +522,27 @@ cdef int update_predicate_cache(int8_t* cache,
                 raise ValueError(Errors.E125.format(value=result))
 
 
-cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states) except *:
+cdef void finish_states(vector[MatchC]& matches, vector[PatternStateC]& states,
+                        vector[vector[MatchAlignmentC]]& align_matches,
+                        vector[vector[MatchAlignmentC]]& align_states,
+                        bint with_alignments) except *:
     """Handle states that end in zero-width patterns."""
     cdef PatternStateC state
+    cdef vector[MatchAlignmentC] align_state
     for i in range(states.size()):
         state = states[i]
+        if with_alignments != 0:
+            align_state = align_states[i]
         while get_quantifier(state) in (ZERO_PLUS, ZERO_ONE):
+            # Update alignment before the transition of current state
+            if with_alignments != 0:
+                align_state.push_back(MatchAlignmentC(state.pattern.token_idx, state.length))
             is_final = get_is_final(state)
             if is_final:
                 ent_id = get_ent_id(state.pattern)
+                # `align_matches` always corresponds to `matches` 1:1
+                if with_alignments != 0:
+                    align_matches.push_back(align_state)
                 matches.push_back(
                     MatchC(pattern_id=ent_id, start=state.start, length=state.length))
                 break
@@ -607,7 +697,7 @@ cdef int8_t get_quantifier(PatternStateC state) nogil:
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs) except NULL:
     pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i, index
-    for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
+    for i, (quantifier, spec, extensions, predicates, token_idx) in enumerate(token_specs):
         pattern[i].quantifier = quantifier
         # Ensure attrs refers to a null pointer if nr_attr == 0
         if len(spec) > 0:
@@ -628,6 +718,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
             pattern[i].py_predicates[j] = index
         pattern[i].nr_py = len(predicates)
         pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
+        pattern[i].token_idx = token_idx
     i = len(token_specs)
     # Use quantifier to identify final ID pattern node (rather than previous
     # uninitialized quantifier == 0/ZERO + nr_attr == 0 + non-zero-length attrs)
@@ -638,6 +729,7 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
     pattern[i].nr_attr = 1
     pattern[i].nr_extra_attr = 0
     pattern[i].nr_py = 0
+    pattern[i].token_idx = -1
     return pattern
 
 
@@ -655,7 +747,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
     """This function interprets the pattern, converting the various bits of
     syntactic sugar before we compile it into a struct with init_pattern.
 
-    We need to split the pattern up into three parts:
+    We need to split the pattern up into four parts:
     * Normal attribute/value pairs, which are stored on either the token or lexeme,
         can be handled directly.
     * Extension attributes are handled specially, as we need to prefetch the
@@ -664,13 +756,14 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
         functions and store them. So we store these specially as well.
     * Extension attributes that have extra predicates are stored within the
         extra_predicates.
+    * Token index that this pattern belongs to.
     """
     tokens = []
     string_store = vocab.strings
-    for spec in token_specs:
+    for token_idx, spec in enumerate(token_specs):
         if not spec:
             # Signifier for 'any token'
-            tokens.append((ONE, [(NULL_ATTR, 0)], [], []))
+            tokens.append((ONE, [(NULL_ATTR, 0)], [], [], token_idx))
             continue
         if not isinstance(spec, dict):
             raise ValueError(Errors.E154.format())
@@ -679,7 +772,7 @@ def _preprocess_pattern(token_specs, vocab, extensions_table, extra_predicates):
         extensions = _get_extensions(spec, string_store, extensions_table)
         predicates = _get_extra_predicates(spec, extra_predicates, vocab)
         for op in ops:
-            tokens.append((op, list(attr_values), list(extensions), list(predicates)))
+            tokens.append((op, list(attr_values), list(extensions), list(predicates), token_idx))
     return tokens
 
 
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index 5f4c2991a..9f575fe05 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -204,3 +204,90 @@ def test_matcher_remove():
     # removing again should throw an error
     with pytest.raises(ValueError):
         matcher.remove("Rule")
+
+
+def test_matcher_with_alignments_greedy_longest(en_vocab):
+    cases = [
+        ("aaab", "a* b", [0, 0, 0, 1]),
+        ("baab", "b a* b", [0, 1, 1, 2]),
+        ("aaab", "a a a b", [0, 1, 2, 3]),
+        ("aaab", "a+ b", [0, 0, 0, 1]),
+        ("aaba", "a+ b a+", [0, 0, 1, 2]),
+        ("aabaa", "a+ b a+", [0, 0, 1, 2, 2]),
+        ("aaba", "a+ b a*", [0, 0, 1, 2]),
+        ("aaaa", "a*", [0, 0, 0, 0]),
+        ("baab", "b a* b b*", [0, 1, 1, 2]),
+        ("aabb", "a* b* a*", [0, 0, 1, 1]),
+        ("aaab", "a+ a+ a b", [0, 1, 2, 3]),
+        ("aaab", "a+ a+ a+ b", [0, 1, 2, 3]),
+        ("aaab", "a+ a a b", [0, 1, 2, 3]),
+        ("aaab", "a+ a a", [0, 1, 2]),
+        ("aaab", "a+ a a?", [0, 1, 2]),
+        ("aaaa", "a a a a a?", [0, 1, 2, 3]),
+        ("aaab", "a+ a b", [0, 0, 1, 2]),
+        ("aaab", "a+ a+ b", [0, 0, 1, 2]),
+    ]
+    for string, pattern_str, result in cases:
+        matcher = Matcher(en_vocab)
+        doc = Doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part.endswith("+"):
+                pattern.append({"ORTH": part[0], "OP": "+"})
+            elif part.endswith("*"):
+                pattern.append({"ORTH": part[0], "OP": "*"})
+            elif part.endswith("?"):
+                pattern.append({"ORTH": part[0], "OP": "?"})
+            else:
+                pattern.append({"ORTH": part})
+        matcher.add("PATTERN", [pattern], greedy="LONGEST")
+        matches = matcher(doc, with_alignments=True)
+        n_matches = len(matches)
+
+        _, s, e, expected = matches[0]
+
+        assert expected == result, (string, pattern_str, s, e, n_matches)
+
+
+def test_matcher_with_alignments_nongreedy(en_vocab):
+    cases = [
+        (0, "aaab", "a* b", [[0, 1], [0, 0, 1], [0, 0, 0, 1], [1]]),
+        (1, "baab", "b a* b", [[0, 1, 1, 2]]),
+        (2, "aaab", "a a a b", [[0, 1, 2, 3]]),
+        (3, "aaab", "a+ b",   [[0, 1], [0, 0, 1], [0, 0, 0, 1]]),
+        (4, "aaba", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2]]),
+        (5, "aabaa", "a+ b a+", [[0, 1, 2], [0, 0, 1, 2], [0, 0, 1, 2, 2], [0, 1, 2, 2] ]),
+        (6, "aaba", "a+ b a*", [[0, 1], [0, 0, 1], [0, 0, 1, 2], [0, 1, 2]]),
+        (7, "aaaa", "a*", [[0], [0, 0], [0, 0, 0], [0, 0, 0, 0]]),
+        (8, "baab", "b a* b b*", [[0, 1, 1, 2]]),
+        (9, "aabb", "a* b* a*", [[1], [2], [2, 2], [0, 1], [0, 0, 1], [0, 0, 1, 1], [0, 1, 1], [1, 1]]),
+        (10, "aaab", "a+ a+ a b", [[0, 1, 2, 3]]),
+        (11, "aaab", "a+ a+ a+ b", [[0, 1, 2, 3]]),
+        (12, "aaab", "a+ a a b", [[0, 1, 2, 3]]),
+        (13, "aaab", "a+ a a", [[0, 1, 2]]),
+        (14, "aaab", "a+ a a?", [[0, 1], [0, 1, 2]]),
+        (15, "aaaa", "a a a a a?", [[0, 1, 2, 3]]),
+        (16, "aaab", "a+ a b", [[0, 1, 2], [0, 0, 1, 2]]),
+        (17, "aaab", "a+ a+ b", [[0, 1, 2], [0, 0, 1, 2]]),
+    ]
+    for case_id, string, pattern_str, results in cases:
+        matcher = Matcher(en_vocab)
+        doc = Doc(matcher.vocab, words=list(string))
+        pattern = []
+        for part in pattern_str.split():
+            if part.endswith("+"):
+                pattern.append({"ORTH": part[0], "OP": "+"})
+            elif part.endswith("*"):
+                pattern.append({"ORTH": part[0], "OP": "*"})
+            elif part.endswith("?"):
+                pattern.append({"ORTH": part[0], "OP": "?"})
+            else:
+                pattern.append({"ORTH": part})
+
+        matcher.add("PATTERN", [pattern])
+        matches = matcher(doc, with_alignments=True)
+        n_matches = len(matches)
+
+        for _, s, e, expected in matches:
+            assert expected in results, (case_id, string, pattern_str, s, e, n_matches)
+            assert len(expected) == e - s
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 95a76586a..c15ee7a47 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -120,13 +120,14 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 > matches = matcher(doc)
 > ```
 
-| Name                                       | Description                                                                                                                                                                                                                                                                                              |
-| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `doclike`                                  | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                                                                                                                  |
-| _keyword-only_                             |                                                                                                                                                                                                                                                                                                          |
-| `as_spans` <Tag variant="new">3</Tag>      | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
-| `allow_missing` <Tag variant="new">3</Tag> | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~                                                                                                                                                                                         |
-| **RETURNS**                                | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
+| Name                                           | Description                                                                                                                                                                                                                                                                                              |
+| ---------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `doclike`                                      | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~                                                                                                                                                                                                                                                  |
+| _keyword-only_                                 |                                                                                                                                                                                                                                                                                                          |
+| `as_spans` <Tag variant="new">3</Tag>          | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
+| `allow_missing` <Tag variant="new">3</Tag>     | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~                                                                                                                                                                                         |
+| `with_alignments` <Tag variant="new">3.1</Tag> | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~                             |
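+| **RETURNS**                                    | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. If `with_alignments` is set to `True` and `as_spans` is `False`, a list of `(match_id, start, end, alignments)` tuples is returned. ~~Union[List[Tuple[int, int, int]], List[Tuple[int, int, int, List[int]]], List[Span]]~~ |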
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
 

From 82d3caf8611b60a73471c73e9b0993fea06b32cd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 8 Apr 2021 10:21:22 +0200
Subject: [PATCH 081/146] Implement replace_listeners for source in config
 (#7620)

Implement replace_listeners for sourced components loaded from a config.
---
 spacy/language.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 04a5e843e..68bd3cd4c 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1673,7 +1673,16 @@ class Language:
                         # model with the same vocab as the current nlp object
                         source_nlps[model] = util.load_model(model, vocab=nlp.vocab)
                     source_name = pipe_cfg.get("component", pipe_name)
+                    listeners_replaced = False
+                    if "replace_listeners" in pipe_cfg:
+                        for name, proc in source_nlps[model].pipeline:
+                            if source_name in getattr(proc, "listening_components", []):
+                                source_nlps[model].replace_listeners(name, source_name, pipe_cfg["replace_listeners"])
+                                listeners_replaced = True
                     nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
+                    # Delete from cache if listeners were replaced
+                    if listeners_replaced:
+                        del source_nlps[model]
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
         nlp.batch_size = config["nlp"]["batch_size"]

From c362006cb982543f8093050bb91c71bd591b7fbe Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 8 Apr 2021 17:24:52 +0900
Subject: [PATCH 082/146] Fix is_sent_start when converting from JSON (fix
 #7635) (#7655)

Data in the JSON format is split into sentences, and each sentence is
saved with is_sent_start flags. Currently the flags are 1 for the first
token and 0 for the others. When deserialized this results in a pattern
of True, None, None, None... which makes single-sentence documents look
as though they haven't had sentence boundaries set.

Since items saved in JSON format have been split into sentences already,
the is_sent_start values should all be True or False.
---
 spacy/training/gold_io.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/training/gold_io.pyx b/spacy/training/gold_io.pyx
index 327748d01..69654e2c7 100644
--- a/spacy/training/gold_io.pyx
+++ b/spacy/training/gold_io.pyx
@@ -121,7 +121,7 @@ def json_to_annotations(doc):
                 if i == 0:
                     sent_starts.append(1)
                 else:
-                    sent_starts.append(0)
+                    sent_starts.append(-1)
             if "brackets" in sent:
                 brackets.extend((b["first"] + sent_start_i,
                                  b["last"] + sent_start_i, b["label"])

From 204c2f116bd74c9a54d045742a33591fb36fb6d9 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 8 Apr 2021 12:19:17 +0200
Subject: [PATCH 083/146] Extend score_spans for overlapping & non-labeled
 spans (#7209)

* extend span scorer with consider_label and allow_overlap

* unit test for spans y2x overlap

* add score_spans unit test

* docs for new fields in scorer.score_spans

* rename to include_label

* spell out if-else for clarity

* rename to 'labeled'

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/scorer.py                       | 52 +++++++++++++++++----------
 spacy/tests/test_scorer.py            | 49 +++++++++++++++++++++++--
 spacy/tests/training/test_training.py | 23 ++++++++++++
 spacy/training/example.pyx            | 15 ++++----
 website/docs/api/example.md           | 32 +++++++++--------
 website/docs/api/scorer.md            | 18 +++++-----
 6 files changed, 139 insertions(+), 50 deletions(-)

diff --git a/spacy/scorer.py b/spacy/scorer.py
index 8061aa329..25df44f14 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -311,6 +311,8 @@ class Scorer:
         *,
         getter: Callable[[Doc, str], Iterable[Span]] = getattr,
         has_annotation: Optional[Callable[[Doc], bool]] = None,
+        labeled: bool = True,
+        allow_overlap: bool = False,
         **cfg,
     ) -> Dict[str, Any]:
         """Returns PRF scores for labeled spans.
@@ -323,6 +325,11 @@ class Scorer:
         has_annotation (Optional[Callable[[Doc], bool]]) should return whether a `Doc`
             has annotation for this `attr`. Docs without annotation are skipped for
             scoring purposes.
+        labeled (bool): Whether or not to include label information in
+            the evaluation. If set to 'False', two spans will be considered
+            equal if their start and end match, irrespective of their label.
+        allow_overlap (bool): Whether or not to allow overlapping spans.
+            If set to 'False', the alignment will automatically resolve conflicts.
         RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
             the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
 
@@ -351,33 +358,42 @@ class Scorer:
             gold_spans = set()
             pred_spans = set()
             for span in getter(gold_doc, attr):
-                gold_span = (span.label_, span.start, span.end - 1)
+                if labeled:
+                    gold_span = (span.label_, span.start, span.end - 1)
+                else:
+                    gold_span = (span.start, span.end - 1)
                 gold_spans.add(gold_span)
-                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+                gold_per_type[span.label_].add(gold_span)
             pred_per_type = {label: set() for label in labels}
-            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
-                pred_spans.add((span.label_, span.start, span.end - 1))
-                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
+            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr), allow_overlap):
+                if labeled:
+                    pred_span = (span.label_, span.start, span.end - 1)
+                else:
+                    pred_span = (span.start, span.end - 1)
+                pred_spans.add(pred_span)
+                pred_per_type[span.label_].add(pred_span)
             # Scores per label
-            for k, v in score_per_type.items():
-                if k in pred_per_type:
-                    v.score_set(pred_per_type[k], gold_per_type[k])
+            if labeled:
+                for k, v in score_per_type.items():
+                    if k in pred_per_type:
+                        v.score_set(pred_per_type[k], gold_per_type[k])
             # Score for all labels
             score.score_set(pred_spans, gold_spans)
-        if len(score) > 0:
-            return {
-                f"{attr}_p": score.precision,
-                f"{attr}_r": score.recall,
-                f"{attr}_f": score.fscore,
-                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
-            }
-        else:
-            return {
+        # Assemble final result
+        final_scores = {
                 f"{attr}_p": None,
                 f"{attr}_r": None,
                 f"{attr}_f": None,
-                f"{attr}_per_type": None,
             }
+        if labeled:
+            final_scores[f"{attr}_per_type"] = None
+        if len(score) > 0:
+            final_scores[f"{attr}_p"] = score.precision
+            final_scores[f"{attr}_r"] = score.recall
+            final_scores[f"{attr}_f"] = score.fscore
+            if labeled:
+                final_scores[f"{attr}_per_type"] = {k: v.to_dict() for k, v in score_per_type.items()}
+        return final_scores
 
     @staticmethod
     def score_cats(
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index ecdaee768..c044d8afe 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -6,7 +6,7 @@ from spacy.training.iob_utils import offsets_to_biluo_tags
 from spacy.scorer import Scorer, ROCAUCScore, PRFScore
 from spacy.scorer import _roc_auc_score, _roc_curve
 from spacy.lang.en import English
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 
 
 test_las_apple = [
@@ -405,6 +405,51 @@ def test_roc_auc_score():
         _ = score.score  # noqa: F841
 
 
+def test_score_spans():
+    nlp = English()
+    text = "This is just a random sentence."
+    key = "my_spans"
+    gold = nlp.make_doc(text)
+    pred = nlp.make_doc(text)
+    spans = []
+    spans.append(gold.char_span(0, 4, label="PERSON"))
+    spans.append(gold.char_span(0, 7, label="ORG"))
+    spans.append(gold.char_span(8, 12, label="ORG"))
+    gold.spans[key] = spans
+
+    def span_getter(doc, span_key):
+        return doc.spans[span_key]
+
+    # Predict exactly the same, but overlapping spans will be discarded
+    pred.spans[key] = spans
+    eg = Example(pred, gold)
+    scores = Scorer.score_spans([eg], attr=key, getter=span_getter)
+    assert scores[f"{key}_p"] == 1.0
+    assert scores[f"{key}_r"] < 1.0
+
+    # Allow overlapping, now both precision and recall should be 100%
+    pred.spans[key] = spans
+    eg = Example(pred, gold)
+    scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
+    assert scores[f"{key}_p"] == 1.0
+    assert scores[f"{key}_r"] == 1.0
+
+    # Change the predicted labels
+    new_spans = [Span(pred, span.start, span.end, label="WRONG") for span in spans]
+    pred.spans[key] = new_spans
+    eg = Example(pred, gold)
+    scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True)
+    assert scores[f"{key}_p"] == 0.0
+    assert scores[f"{key}_r"] == 0.0
+    assert f"{key}_per_type" in scores
+
+    # Discard labels from the evaluation
+    scores = Scorer.score_spans([eg], attr=key, getter=span_getter, allow_overlap=True, labeled=False)
+    assert scores[f"{key}_p"] == 1.0
+    assert scores[f"{key}_r"] == 1.0
+    assert f"{key}_per_type" not in scores
+
+
 def test_prf_score():
     cand = {"hi", "ho"}
     gold1 = {"yo", "hi"}
@@ -422,4 +467,4 @@ def test_prf_score():
     assert (c.precision, c.recall, c.fscore) == approx((0.25, 0.5, 0.33333333))
 
     a += b
-    assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore))
+    assert (a.precision, a.recall, a.fscore) == approx((c.precision, c.recall, c.fscore))
\ No newline at end of file
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index c7a85bf87..321c08c1e 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -426,6 +426,29 @@ def test_aligned_spans_x2y(en_vocab, en_tokenizer):
     assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2), (4, 6)]
 
 
+def test_aligned_spans_y2x_overlap(en_vocab, en_tokenizer):
+    text = "I flew to San Francisco Valley"
+    nlp = English()
+    doc = nlp(text)
+    # the reference doc has overlapping spans
+    gold_doc = nlp.make_doc(text)
+    spans = []
+    prefix = "I flew to "
+    spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco"), label="CITY"))
+    spans.append(gold_doc.char_span(len(prefix), len(prefix + "San Francisco Valley"), label="VALLEY"))
+    spans_key = "overlap_ents"
+    gold_doc.spans[spans_key] = spans
+    example = Example(doc, gold_doc)
+    spans_gold = example.reference.spans[spans_key]
+    assert [(ent.start, ent.end) for ent in spans_gold] == [(3, 5), (3, 6)]
+
+    # Ensure that 'get_aligned_spans_y2x' has the aligned entities correct
+    spans_y2x_no_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=False)
+    assert [(ent.start, ent.end) for ent in spans_y2x_no_overlap] == [(3, 5)]
+    spans_y2x_overlap = example.get_aligned_spans_y2x(spans_gold, allow_overlap=True)
+    assert [(ent.start, ent.end) for ent in spans_y2x_overlap] == [(3, 5), (3, 6)]
+
+
 def test_gold_ner_missing_tags(en_tokenizer):
     doc = en_tokenizer("I flew to Silicon Valley via London.")
     biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 9cf825bf9..74af793bd 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -213,18 +213,19 @@ cdef class Example:
         else:
             return [None] * len(self.x)
 
-    def get_aligned_spans_x2y(self, x_spans):
-        return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y)
+    def get_aligned_spans_x2y(self, x_spans, allow_overlap=False):
+        return self._get_aligned_spans(self.y, x_spans, self.alignment.x2y, allow_overlap)
 
-    def get_aligned_spans_y2x(self, y_spans):
-        return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x)
+    def get_aligned_spans_y2x(self, y_spans, allow_overlap=False):
+        return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x, allow_overlap)
 
-    def _get_aligned_spans(self, doc, spans, align):
+    def _get_aligned_spans(self, doc, spans, align, allow_overlap):
         seen = set()
         output = []
         for span in spans:
             indices = align[span.start : span.end].data.ravel()
-            indices = [idx for idx in indices if idx not in seen]
+            if not allow_overlap:
+                indices = [idx for idx in indices if idx not in seen]
             if len(indices) >= 1:
                 aligned_span = Span(doc, indices[0], indices[-1] + 1, label=span.label)
                 target_text = span.text.lower().strip().replace(" ", "")
@@ -237,7 +238,7 @@ cdef class Example:
     def get_aligned_ner(self):
         if not self.y.has_annotation("ENT_IOB"):
             return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
-        x_ents = self.get_aligned_spans_y2x(self.y.ents)
+        x_ents = self.get_aligned_spans_y2x(self.y.ents, allow_overlap=False)
         # Default to 'None' for missing values
         x_tags = offsets_to_biluo_tags(
             self.x,
diff --git a/website/docs/api/example.md b/website/docs/api/example.md
index 2811f4d91..ca9d3c056 100644
--- a/website/docs/api/example.md
+++ b/website/docs/api/example.md
@@ -33,8 +33,8 @@ both documents.
 
 | Name           | Description                                                                                                              |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~                                                |
-| `reference`    | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~                                            |
+| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~                                                 |
+| `reference`    | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~                                             |
 | _keyword-only_ |                                                                                                                          |
 | `alignment`    | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
 
@@ -56,11 +56,11 @@ see the [training format documentation](/api/data-formats#dict-input).
 > example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
 > ```
 
-| Name           | Description                                                               |
-| -------------- | ------------------------------------------------------------------------- |
-| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
-| `example_dict` | `Dict[str, obj]`                                                          | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
-| **RETURNS**    | The newly constructed object. ~~Example~~                                 |
+| Name           | Description                                                                         |
+| -------------- | ----------------------------------------------------------------------------------- |
+| `predicted`    | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~            |
+| `example_dict` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
+| **RETURNS**    | The newly constructed object. ~~Example~~                                           |
 
 ## Example.text {#text tag="property"}
 
@@ -211,10 +211,11 @@ align to the tokenization in [`Example.predicted`](/api/example#predicted).
 > assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)]
 > ```
 
-| Name        | Description                                                                   |
-| ----------- | ----------------------------------------------------------------------------- |
-| `y_spans`   | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ |
-| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~     |
+| Name            | Description                                                                                  |
+| --------------- | -------------------------------------------------------------------------------------------- |
+| `y_spans`       | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~                |
+| `allow_overlap` | Whether the resulting `Span` objects may overlap or not. Set to `False` by default. ~~bool~~ |
+| **RETURNS**     | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~                    |
 
 ## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"}
 
@@ -238,10 +239,11 @@ against the original gold-standard annotation.
 > assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)]
 > ```
 
-| Name        | Description                                                                   |
-| ----------- | ----------------------------------------------------------------------------- |
-| `x_spans`   | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~ |
-| **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~     |
+| Name            | Description                                                                                  |
+| --------------- | -------------------------------------------------------------------------------------------- |
+| `x_spans`       | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~                |
+| `allow_overlap` | Whether the resulting `Span` objects may overlap or not. Set to `False` by default. ~~bool~~ |
+| **RETURNS**     | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~                    |
 
 ## Example.to_dict {#to_dict tag="method"}
 
diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md
index cf1a1ca1f..7398bae81 100644
--- a/website/docs/api/scorer.md
+++ b/website/docs/api/scorer.md
@@ -137,14 +137,16 @@ Returns PRF scores for labeled or unlabeled spans.
 > print(scores["ents_f"])
 > ```
 
-| Name             | Description                                                                                                                                                                                                        |
-| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                                                |
-| `attr`           | The attribute to score. ~~str~~                                                                                                                                                                                    |
-| _keyword-only_   |                                                                                                                                                                                                                    |
-| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                                         |
-| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
-| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~                        |
+| Name             | Description                                                                                                                                                                                 |
+| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`       | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~                                                                         |
+| `attr`           | The attribute to score. ~~str~~                                                                                                                                                             |
+| _keyword-only_   |                                                                                                                                                                                             |
+| `getter`         | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~                                  |
+| `has_annotation` | Defaults to `None`. If provided, `has_annotation(doc)` should return whether a `Doc` has annotation for this `attr`. Docs without annotation are skipped for scoring purposes. ~~Optional[Callable[[Doc], bool]]~~ |
+| `labeled`        | Defaults to `True`. If set to `False`, two spans will be considered equal if their start and end match, irrespective of their label. The `{attr}_per_type` key is then omitted. ~~bool~~    |
+| `allow_overlap`  | Defaults to `False`. Whether or not to allow overlapping spans. If set to `False`, the alignment will automatically resolve conflicts. ~~bool~~                                             |
+| **RETURNS**      | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
 
 ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
 

From e6b7600adf70e5586b19d938d3a4ba7b12244f44 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 8 Apr 2021 12:25:03 +0200
Subject: [PATCH 084/146] Fix parser sourcing in NER converter (#7631)

---
 spacy/training/converters/conll_ner_to_docs.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/training/converters/conll_ner_to_docs.py b/spacy/training/converters/conll_ner_to_docs.py
index 8c1bad9ea..28b21c5f0 100644
--- a/spacy/training/converters/conll_ner_to_docs.py
+++ b/spacy/training/converters/conll_ner_to_docs.py
@@ -124,6 +124,9 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
         nlp = load_model(model)
         if "parser" in nlp.pipe_names:
             msg.info(f"Segmenting sentences with parser from model '{model}'.")
+            for name, proc in nlp.pipeline:
+                if "parser" in getattr(proc, "listening_components", []):
+                    nlp.replace_listeners(name, "parser", ["model.tok2vec"])
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
         msg.info(

From 3e5bd5055e8cd2198f4432fa765be11bb2f47ddd Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 8 Apr 2021 12:25:42 +0200
Subject: [PATCH 085/146] expand quickstart widget with cuda 11.1 and 11.2
 (#7615)

---
 website/src/widgets/quickstart-install.js | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js
index 492d09605..8ed602b72 100644
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@@ -17,6 +17,8 @@ const CUDA = {
     '10.1': 'cuda101',
     '10.2': 'cuda102',
     '11.0': 'cuda110',
+    '11.1': 'cuda111',
+    '11.2': 'cuda112',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models
 

From 8008e2f75b93505734018a072e4131650459ec3a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 8 Apr 2021 13:22:38 +0200
Subject: [PATCH 086/146] Use morph hash in lemmatizer cache key (#7690)

Use the morph hash rather than the `MorphAnalysis` object in the cache
key so that the `Lemmatizer` can be pickled.
---
 spacy/pipeline/lemmatizer.py            | 2 +-
 spacy/tests/pipeline/test_lemmatizer.py | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py
index 21f1a8a8b..cfe405efa 100644
--- a/spacy/pipeline/lemmatizer.py
+++ b/spacy/pipeline/lemmatizer.py
@@ -175,7 +175,7 @@ class Lemmatizer(Pipe):
 
         DOCS: https://spacy.io/api/lemmatizer#rule_lemmatize
         """
-        cache_key = (token.orth, token.pos, token.morph)
+        cache_key = (token.orth, token.pos, token.morph.key)
         if cache_key in self.cache:
             return self.cache[cache_key]
         string = token.text
diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py
index 1943d3dd7..3c16d3bcb 100644
--- a/spacy/tests/pipeline/test_lemmatizer.py
+++ b/spacy/tests/pipeline/test_lemmatizer.py
@@ -1,6 +1,7 @@
 import pytest
 import logging
 import mock
+import pickle
 from spacy import util, registry
 from spacy.lang.en import English
 from spacy.lookups import Lookups
@@ -106,6 +107,9 @@ def test_lemmatizer_serialize(nlp):
     doc2 = nlp2.make_doc("coping")
     doc2[0].pos_ = "VERB"
     assert doc2[0].lemma_ == ""
-    doc2 = lemmatizer(doc2)
+    doc2 = lemmatizer2(doc2)
     assert doc2[0].text == "coping"
     assert doc2[0].lemma_ == "cope"
+
+    # Make sure that lemmatizer cache can be pickled
+    b = pickle.dumps(lemmatizer2)

From 25168968495fd8ba7485e50e786434fa17fc97fa Mon Sep 17 00:00:00 2001
From: Stanislav Schmidt <Stannislav@users.noreply.github.com>
Date: Fri, 9 Apr 2021 11:53:13 +0200
Subject: [PATCH 087/146] Make vocab update in get_docs deterministic (#7603)

* Make vocab update in get_docs deterministic

The attribute `DocBin.strings` is a set. In `DocBin.get_docs`
a given vocab is updated by iterating over this set.
Iteration over a python set produces an arbitrary ordering,
therefore vocab is updated non-deterministically.

When training (fine-tuning) a spacy model, the base model's
vocabulary will be updated with the new vocabulary in the
training data in exactly the way described above. After
serialization, the file `model/vocab/strings.json` will
be sorted in an arbitrary way. This prevents reproducible
model training.

* Revert "Make vocab update in get_docs deterministic"

This reverts commit d6b87a2f558b52d66549b6a66c0af00e283ad628.

* Sort strings in StringStore serialization

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 spacy/strings.pyx                             |  4 ++--
 .../serialize/test_serialize_vocab_strings.py | 20 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 6a1d68221..4a20cb8af 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -223,7 +223,7 @@ cdef class StringStore:
             it doesn't exist. Paths may be either strings or Path-like objects.
         """
         path = util.ensure_path(path)
-        strings = list(self)
+        strings = sorted(self)
         srsly.write_json(path, strings)
 
     def from_disk(self, path):
@@ -247,7 +247,7 @@ cdef class StringStore:
 
         RETURNS (bytes): The serialized form of the `StringStore` object.
         """
-        return srsly.json_dumps(list(self))
+        return srsly.json_dumps(sorted(self))
 
     def from_bytes(self, bytes_data, **kwargs):
         """Load state from a binary string.
diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py
index 45a546203..3fe9363bf 100644
--- a/spacy/tests/serialize/test_serialize_vocab_strings.py
+++ b/spacy/tests/serialize/test_serialize_vocab_strings.py
@@ -49,9 +49,9 @@ def test_serialize_vocab_roundtrip_disk(strings1, strings2):
         vocab1_d = Vocab().from_disk(file_path1)
         vocab2_d = Vocab().from_disk(file_path2)
         # check strings rather than lexemes, which are only reloaded on demand
-        assert strings1 == [s for s in vocab1_d.strings]
-        assert strings2 == [s for s in vocab2_d.strings]
-        if strings1 == strings2:
+        assert set(strings1) == set([s for s in vocab1_d.strings])
+        assert set(strings2) == set([s for s in vocab2_d.strings])
+        if set(strings1) == set(strings2):
             assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
         else:
             assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]
@@ -96,7 +96,7 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
     sstore2 = StringStore(strings=strings2)
     sstore1_b = sstore1.to_bytes()
     sstore2_b = sstore2.to_bytes()
-    if strings1 == strings2:
+    if set(strings1) == set(strings2):
         assert sstore1_b == sstore2_b
     else:
         assert sstore1_b != sstore2_b
@@ -104,7 +104,7 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
     assert sstore1.to_bytes() == sstore1_b
     new_sstore1 = StringStore().from_bytes(sstore1_b)
     assert new_sstore1.to_bytes() == sstore1_b
-    assert list(new_sstore1) == strings1
+    assert set(new_sstore1) == set(strings1)
 
 
 @pytest.mark.parametrize("strings1,strings2", test_strings)
@@ -118,12 +118,12 @@ def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
         sstore2.to_disk(file_path2)
         sstore1_d = StringStore().from_disk(file_path1)
         sstore2_d = StringStore().from_disk(file_path2)
-        assert list(sstore1_d) == list(sstore1)
-        assert list(sstore2_d) == list(sstore2)
-        if strings1 == strings2:
-            assert list(sstore1_d) == list(sstore2_d)
+        assert set(sstore1_d) == set(sstore1)
+        assert set(sstore2_d) == set(sstore2)
+        if set(strings1) == set(strings2):
+            assert set(sstore1_d) == set(sstore2_d)
         else:
-            assert list(sstore1_d) != list(sstore2_d)
+            assert set(sstore1_d) != set(sstore2_d)
 
 
 @pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)

From 73a8c0f9922b30b5b21d514a3b0bec0d53769d11 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Apr 2021 11:53:42 +0200
Subject: [PATCH 088/146] Update debug data further for v3 (#7602)

* Update debug data further for v3

* Remove new/existing label distinction (new labels are not immediately
distinguishable because the pipeline is already initialized)
* Warn on missing labels in training data for all components except parser
* Separate textcat and textcat_multilabel sections
* Add section for morphologizer

* Reword missing label warnings
---
 spacy/cli/debug_data.py | 168 +++++++++++++++++++++++++++++-----------
 1 file changed, 122 insertions(+), 46 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index be11f8d1c..3351e53fe 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,4 +1,4 @@
-from typing import List, Sequence, Dict, Any, Tuple, Optional
+from typing import List, Sequence, Dict, Any, Tuple, Optional, Set
 from pathlib import Path
 from collections import Counter
 import sys
@@ -13,6 +13,8 @@ from ..training.initialize import get_sourced_components
 from ..schemas import ConfigSchemaTraining
 from ..pipeline._parser_internals import nonproj
 from ..pipeline._parser_internals.nonproj import DELIMITER
+from ..pipeline import Morphologizer
+from ..morphology import Morphology
 from ..language import Language
 from ..util import registry, resolve_dot_names
 from .. import util
@@ -194,32 +196,32 @@ def debug_data(
         )
         label_counts = gold_train_data["ner"]
         model_labels = _get_labels_from_model(nlp, "ner")
-        new_labels = [l for l in labels if l not in model_labels]
-        existing_labels = [l for l in labels if l in model_labels]
         has_low_data_warning = False
         has_no_neg_warning = False
         has_ws_ents_error = False
         has_punct_ents_warning = False
 
         msg.divider("Named Entity Recognition")
-        msg.info(
-            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
-        )
+        msg.info(f"{len(model_labels)} label(s)")
         missing_values = label_counts["-"]
         msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
-        for label in new_labels:
+        for label in labels:
             if len(label) == 0:
-                msg.fail("Empty label found in new labels")
-        if new_labels:
-            labels_with_counts = [
-                (label, count)
-                for label, count in label_counts.most_common()
-                if label != "-"
-            ]
-            labels_with_counts = _format_labels(labels_with_counts, counts=True)
-            msg.text(f"New: {labels_with_counts}", show=verbose)
-        if existing_labels:
-            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
+                msg.fail("Empty label found in train data")
+        labels_with_counts = [
+            (label, count)
+            for label, count in label_counts.most_common()
+            if label != "-"
+        ]
+        labels_with_counts = _format_labels(labels_with_counts, counts=True)
+        msg.text(f"Labels in train data: {_format_labels(labels)}", show=verbose)
+        missing_labels = model_labels - labels
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
         if gold_train_data["ws_ents"]:
             msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
             has_ws_ents_error = True
@@ -228,10 +230,10 @@ def debug_data(
             msg.warn(f"{gold_train_data['punct_ents']} entity span(s) with punctuation")
             has_punct_ents_warning = True
 
-        for label in new_labels:
+        for label in labels:
             if label_counts[label] <= NEW_LABEL_THRESHOLD:
                 msg.warn(
-                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
+                    f"Low number of examples for label '{label}' ({label_counts[label]})"
                 )
                 has_low_data_warning = True
 
@@ -276,22 +278,52 @@ def debug_data(
             )
 
     if "textcat" in factory_names:
-        msg.divider("Text Classification")
-        labels = [label for label in gold_train_data["cats"]]
-        model_labels = _get_labels_from_model(nlp, "textcat")
-        new_labels = [l for l in labels if l not in model_labels]
-        existing_labels = [l for l in labels if l in model_labels]
-        msg.info(
-            f"Text Classification: {len(new_labels)} new label(s), "
-            f"{len(existing_labels)} existing label(s)"
+        msg.divider("Text Classification (Exclusive Classes)")
+        labels = _get_labels_from_model(nlp, "textcat")
+        msg.info(f"Text Classification: {len(labels)} label(s)")
+        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
+        labels_with_counts = _format_labels(
+            gold_train_data["cats"].most_common(), counts=True
         )
-        if new_labels:
-            labels_with_counts = _format_labels(
-                gold_train_data["cats"].most_common(), counts=True
+        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
+        missing_labels = labels - set(gold_train_data["cats"].keys())
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
+        if gold_train_data["n_cats_multilabel"] > 0:
+            # Note: you should never get here because you run into E895 on
+            # initialization first.
+            msg.warn(
+                "The train data contains instances without "
+                "mutually-exclusive classes. Use the component "
+                "'textcat_multilabel' instead of 'textcat'."
+            )
+        if gold_dev_data["n_cats_multilabel"] > 0:
+            msg.fail(
+                "Train/dev mismatch: the dev data contains instances "
+                "without mutually-exclusive classes while the train data "
+                "contains only instances with mutually-exclusive classes."
+            )
+
+    if "textcat_multilabel" in factory_names:
+        msg.divider("Text Classification (Multilabel)")
+        labels = _get_labels_from_model(nlp, "textcat_multilabel")
+        msg.info(f"Text Classification: {len(labels)} label(s)")
+        msg.text(f"Labels: {_format_labels(labels)}", show=verbose)
+        labels_with_counts = _format_labels(
+            gold_train_data["cats"].most_common(), counts=True
+        )
+        msg.text(f"Labels in train data: {labels_with_counts}", show=verbose)
+        missing_labels = labels - set(gold_train_data["cats"].keys())
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
             )
-            msg.text(f"New: {labels_with_counts}", show=verbose)
-        if existing_labels:
-            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
         if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
             msg.fail(
                 f"The train and dev labels are not the same. "
@@ -299,11 +331,6 @@ def debug_data(
                 f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
             )
         if gold_train_data["n_cats_multilabel"] > 0:
-            msg.info(
-                "The train data contains instances without "
-                "mutually-exclusive classes. Use '--textcat-multilabel' "
-                "when training."
-            )
             if gold_dev_data["n_cats_multilabel"] == 0:
                 msg.warn(
                     "Potential train/dev mismatch: the train data contains "
@@ -311,9 +338,10 @@ def debug_data(
                     "dev data does not."
                 )
         else:
-            msg.info(
+            msg.warn(
                 "The train data contains only instances with "
-                "mutually-exclusive classes."
+                "mutually-exclusive classes. You can potentially use the "
+                "component 'textcat' instead of 'textcat_multilabel'."
             )
             if gold_dev_data["n_cats_multilabel"] > 0:
                 msg.fail(
@@ -325,13 +353,37 @@ def debug_data(
     if "tagger" in factory_names:
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
-        # TODO: does this need to be updated?
-        msg.info(f"{len(labels)} label(s) in data")
+        model_labels = _get_labels_from_model(nlp, "tagger")
+        msg.info(f"{len(labels)} label(s) in train data")
+        missing_labels = model_labels - set(labels)
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
 
+    if "morphologizer" in factory_names:
+        msg.divider("Morphologizer (POS+Morph)")
+        labels = [label for label in gold_train_data["morphs"]]
+        model_labels = _get_labels_from_model(nlp, "morphologizer")
+        msg.info(f"{len(labels)} label(s) in train data")
+        missing_labels = model_labels - set(labels)
+        if missing_labels:
+            msg.warn(
+                "Some model labels are not present in the train data. The "
+                "model performance may be degraded for these labels after "
+                f"training: {_format_labels(missing_labels)}."
+            )
+        labels_with_counts = _format_labels(
+            gold_train_data["morphs"].most_common(), counts=True
+        )
+        msg.text(labels_with_counts, show=verbose)
+
     if "parser" in factory_names:
         has_low_data_warning = False
         msg.divider("Dependency Parsing")
@@ -491,6 +543,7 @@ def _compile_gold(
         "ner": Counter(),
         "cats": Counter(),
         "tags": Counter(),
+        "morphs": Counter(),
         "deps": Counter(),
         "words": Counter(),
         "roots": Counter(),
@@ -544,13 +597,36 @@ def _compile_gold(
                     data["ner"][combined_label] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
-        if "textcat" in factory_names:
+        if "textcat" in factory_names or "textcat_multilabel" in factory_names:
             data["cats"].update(gold.cats)
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
         if "tagger" in factory_names:
             tags = eg.get_aligned("TAG", as_string=True)
             data["tags"].update([x for x in tags if x is not None])
+        if "morphologizer" in factory_names:
+            pos_tags = eg.get_aligned("POS", as_string=True)
+            morphs = eg.get_aligned("MORPH", as_string=True)
+            for pos, morph in zip(pos_tags, morphs):
+                # POS may align (same value for multiple tokens) when morph
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that the label counts don't end up
+                # with an unknown morph+POS combination
+                if pos is None or morph is None:
+                    pass
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    pass
+                # Otherwise, generate the combined label
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[Morphologizer.POS_FEAT] = pos
+                    label = eg.reference.vocab.strings[
+                        eg.reference.vocab.morphology.add(label_dict)
+                    ]
+                    data["morphs"].update([label])
         if "parser" in factory_names:
             aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
             data["deps"].update([x for x in aligned_deps if x is not None])
@@ -584,8 +660,8 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     return count
 
 
-def _get_labels_from_model(nlp: Language, pipe_name: str) -> Sequence[str]:
+def _get_labels_from_model(nlp: Language, pipe_name: str) -> Set[str]:
     if pipe_name not in nlp.pipe_names:
         return set()
     pipe = nlp.get_pipe(pipe_name)
-    return pipe.labels
+    return set(pipe.labels)

From 673e2bc4c0ca46f1c026e0823d32f35c52d2f38e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 9 Apr 2021 16:15:38 +0200
Subject: [PATCH 089/146] Add usage docs for streamed train corpora (#7693)

---
 website/docs/api/data-formats.md |  6 +-
 website/docs/usage/training.md   | 98 ++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index ac6f4183d..53ca8a51d 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -193,10 +193,10 @@ process that are used when you run [`spacy train`](/api/cli#train).
 | `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
 | `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
 | `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
-| `max_epochs`          | Maximum number of epochs to train for. Defaults to `0`. ~~int~~                                                                                                                                                                                                                                                                     |
-| `max_steps`           | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                                                           |
+| `max_epochs`          | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory, with no shuffling within the training loop. Defaults to `0`. ~~int~~                                                                                            |
+| `max_steps`           | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                   |
 | `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
-| `patience`            | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~                                                                                                                                                                                                                                     |
+| `patience`            | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~                                                                                                                                                                                                        |
 | `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
 | `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
 | `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 5e9d3303c..9f929fe19 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -1130,8 +1130,8 @@ any other custom workflows. `corpora.train` and `corpora.dev` are used as
 conventions within spaCy's default configs, but you can also define any other
 custom blocks. Each section in the corpora config should resolve to a
 [`Corpus`](/api/corpus) – for example, using spaCy's built-in
-[corpus reader](/api/top-level#readers) that takes a path to a binary `.spacy`
-file. The `train_corpus` and `dev_corpus` fields in the
+[corpus reader](/api/top-level#corpus-readers) that takes a path to a binary
+`.spacy` file. The `train_corpus` and `dev_corpus` fields in the
 [`[training]`](/api/data-formats#config-training) block specify where to find
 the corpus in your config. This makes it easy to **swap out** different corpora
 by only changing a single config setting.
@@ -1142,21 +1142,23 @@ corpora, keyed by corpus name, e.g. `"train"` and `"dev"`. This can be
 especially useful if you need to split a single file into corpora for training
 and evaluation, without loading the same file twice.
 
+By default, the training data is loaded into memory and shuffled before each
+epoch. If the corpus is **too large to fit into memory** during training, stream
+the corpus using a custom reader as described in the next section.
+
 ### Custom data reading and batching {#custom-code-readers-batchers}
 
 Some use-cases require **streaming in data** or manipulating datasets on the
-fly, rather than generating all data beforehand and storing it to file. Instead
+fly, rather than generating all data beforehand and storing it to disk. Instead
 of using the built-in [`Corpus`](/api/corpus) reader, which uses static file
 paths, you can create and register a custom function that generates
-[`Example`](/api/example) objects. The resulting generator can be infinite. When
-using this dataset for training, stopping criteria such as maximum number of
-steps, or stopping when the loss does not decrease further, can be used.
+[`Example`](/api/example) objects.
 
-In this example we assume a custom function `read_custom_data` which loads or
-generates texts with relevant text classification annotations. Then, small
-lexical variations of the input text are created before generating the final
-[`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
-you register the function creating the custom reader in the `readers`
+In the following example we assume a custom function `read_custom_data` which
+loads or generates texts with relevant text classification annotations. Then,
+small lexical variations of the input text are created before generating the
+final [`Example`](/api/example) objects. The `@spacy.registry.readers` decorator
+lets you register the function creating the custom reader in the `readers`
 [registry](/api/top-level#registry) and assign it a string name, so it can be
 used in your config. All arguments on the registered function become available
 as **config settings** – in this case, `source`.
@@ -1199,6 +1201,80 @@ Remember that a registered function should always be a function that spaCy
 
 </Infobox>
 
+If the corpus is **too large to load into memory** or the corpus reader is an
+**infinite generator**, use the setting `max_epochs = -1` to indicate that the
+train corpus should be streamed. With this setting the train corpus is merely
+streamed and batched, not shuffled, so any shuffling needs to be implemented in
+the corpus reader itself. In the example below, a corpus reader that generates
+sentences containing even or odd numbers is used with an unlimited number of
+examples for the train corpus and a limited number of examples for the dev
+corpus. The dev corpus should always be finite and fit in memory during the
+evaluation step. `max_steps` and/or `patience` are used to determine when the
+training should stop.
+
+> #### config.cfg
+>
+> ```ini
+> [corpora.dev]
+> @readers = "even_odd.v1"
+> limit = 100
+>
+> [corpora.train]
+> @readers = "even_odd.v1"
+> limit = -1
+>
+> [training]
+> max_epochs = -1
+> patience = 500
+> max_steps = 2000
+> ```
+
+```python
+### functions.py
+from typing import Callable, Iterable, Iterator
+from spacy import util
+import random
+from spacy.training import Example
+from spacy import Language
+
+
+@util.registry.readers("even_odd.v1")
+def create_even_odd_corpus(limit: int = -1) -> Callable[[Language], Iterable[Example]]:
+    return EvenOddCorpus(limit)
+
+
+class EvenOddCorpus:
+    def __init__(self, limit):
+        self.limit = limit
+
+    def __call__(self, nlp: Language) -> Iterator[Example]:
+        i = 0
+        while i < self.limit or self.limit < 0:
+            r = random.randint(0, 1000)
+            cat = r % 2 == 0
+            text = "This is sentence " + str(r)
+            yield Example.from_dict(
+                nlp.make_doc(text), {"cats": {"EVEN": cat, "ODD": not cat}}
+            )
+            i += 1
+```
+
+> #### config.cfg
+>
+> ```ini
+> [initialize.components.textcat.labels]
+> @readers = "spacy.read_labels.v1"
+> path = "labels/textcat.json"
+> require = true
+> ```
+
+If the train corpus is streamed, the initialize step peeks at the first 100
+examples in the corpus to find the labels for each component. If this isn't
+sufficient, you'll need to [provide the labels](#initialization-labels) for each
+component in the `[initialize]` block. [`init labels`](/api/cli#init-labels) can
+be used to generate JSON files in the correct format, which you can extend with
+the full label set.
+
 We can also customize the **batching strategy** by registering a new batcher
 function in the `batchers` [registry](/api/top-level#registry). A batcher turns
 a stream of items into a stream of batches. spaCy has several useful built-in

From 27dbbb99031c4859272cdd36688547b6b1ba0d0e Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 12 Apr 2021 10:08:01 +0200
Subject: [PATCH 090/146] Bugfix/nel crossing sentence (#7630)

* ensure each entity gets a KB ID, even when it's not within a sentence

* cleanup
---
 spacy/pipeline/entity_linker.py          | 140 +++++++++++------------
 spacy/tests/regression/test_issue7065.py |  57 +++++++++
 2 files changed, 127 insertions(+), 70 deletions(-)

diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 630057c3f..6ab52fb35 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -300,77 +300,77 @@ class EntityLinker(TrainablePipe):
         for i, doc in enumerate(docs):
             sentences = [s for s in doc.sents]
             if len(doc) > 0:
-                # Looping through each sentence and each entity
-                # This may go wrong if there are entities across sentences - which shouldn't happen normally.
-                for sent_index, sent in enumerate(sentences):
-                    if sent.ents:
-                        # get n_neightbour sentences, clipped to the length of the document
-                        start_sentence = max(0, sent_index - self.n_sents)
-                        end_sentence = min(
-                            len(sentences) - 1, sent_index + self.n_sents
-                        )
-                        start_token = sentences[start_sentence].start
-                        end_token = sentences[end_sentence].end
-                        sent_doc = doc[start_token:end_token].as_doc()
-                        # currently, the context is the same for each entity in a sentence (should be refined)
-                        xp = self.model.ops.xp
-                        if self.incl_context:
-                            sentence_encoding = self.model.predict([sent_doc])[0]
-                            sentence_encoding_t = sentence_encoding.T
-                            sentence_norm = xp.linalg.norm(sentence_encoding_t)
-                        for ent in sent.ents:
-                            entity_count += 1
-                            if ent.label_ in self.labels_discard:
-                                # ignoring this entity - setting to NIL
-                                final_kb_ids.append(self.NIL)
-                            else:
-                                candidates = self.get_candidates(self.kb, ent)
-                                if not candidates:
-                                    # no prediction possible for this entity - setting to NIL
-                                    final_kb_ids.append(self.NIL)
-                                elif len(candidates) == 1:
-                                    # shortcut for efficiency reasons: take the 1 candidate
-                                    # TODO: thresholding
-                                    final_kb_ids.append(candidates[0].entity_)
-                                else:
-                                    random.shuffle(candidates)
-                                    # set all prior probabilities to 0 if incl_prior=False
-                                    prior_probs = xp.asarray(
-                                        [c.prior_prob for c in candidates]
+                # Looping through each entity (TODO: rewrite)
+                for ent in doc.ents:
+                    sent = ent.sent
+                    sent_index = sentences.index(sent)
+                    assert sent_index >= 0
+                    # get the n_sents neighbouring sentences on each side, clipped to the length of the document
+                    start_sentence = max(0, sent_index - self.n_sents)
+                    end_sentence = min(
+                        len(sentences) - 1, sent_index + self.n_sents
+                    )
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+                    sent_doc = doc[start_token:end_token].as_doc()
+                    # currently, the context is the same for each entity in a sentence (should be refined)
+                    xp = self.model.ops.xp
+                    if self.incl_context:
+                        sentence_encoding = self.model.predict([sent_doc])[0]
+                        sentence_encoding_t = sentence_encoding.T
+                        sentence_norm = xp.linalg.norm(sentence_encoding_t)
+                    entity_count += 1
+                    if ent.label_ in self.labels_discard:
+                        # ignoring this entity - setting to NIL
+                        final_kb_ids.append(self.NIL)
+                    else:
+                        candidates = self.get_candidates(self.kb, ent)
+                        if not candidates:
+                            # no prediction possible for this entity - setting to NIL
+                            final_kb_ids.append(self.NIL)
+                        elif len(candidates) == 1:
+                            # shortcut for efficiency reasons: take the 1 candidate
+                            # TODO: thresholding
+                            final_kb_ids.append(candidates[0].entity_)
+                        else:
+                            random.shuffle(candidates)
+                            # set all prior probabilities to 0 if incl_prior=False
+                            prior_probs = xp.asarray(
+                                [c.prior_prob for c in candidates]
+                            )
+                            if not self.incl_prior:
+                                prior_probs = xp.asarray(
+                                    [0.0 for _ in candidates]
+                                )
+                            scores = prior_probs
+                            # add in similarity from the context
+                            if self.incl_context:
+                                entity_encodings = xp.asarray(
+                                    [c.entity_vector for c in candidates]
+                                )
+                                entity_norm = xp.linalg.norm(
+                                    entity_encodings, axis=1
+                                )
+                                if len(entity_encodings) != len(prior_probs):
+                                    raise RuntimeError(
+                                        Errors.E147.format(
+                                            method="predict",
+                                            msg="vectors not of equal length",
+                                        )
                                     )
-                                    if not self.incl_prior:
-                                        prior_probs = xp.asarray(
-                                            [0.0 for _ in candidates]
-                                        )
-                                    scores = prior_probs
-                                    # add in similarity from the context
-                                    if self.incl_context:
-                                        entity_encodings = xp.asarray(
-                                            [c.entity_vector for c in candidates]
-                                        )
-                                        entity_norm = xp.linalg.norm(
-                                            entity_encodings, axis=1
-                                        )
-                                        if len(entity_encodings) != len(prior_probs):
-                                            raise RuntimeError(
-                                                Errors.E147.format(
-                                                    method="predict",
-                                                    msg="vectors not of equal length",
-                                                )
-                                            )
-                                        # cosine similarity
-                                        sims = xp.dot(
-                                            entity_encodings, sentence_encoding_t
-                                        ) / (sentence_norm * entity_norm)
-                                        if sims.shape != prior_probs.shape:
-                                            raise ValueError(Errors.E161)
-                                        scores = (
-                                            prior_probs + sims - (prior_probs * sims)
-                                        )
-                                    # TODO: thresholding
-                                    best_index = scores.argmax().item()
-                                    best_candidate = candidates[best_index]
-                                    final_kb_ids.append(best_candidate.entity_)
+                                # cosine similarity
+                                sims = xp.dot(
+                                    entity_encodings, sentence_encoding_t
+                                ) / (sentence_norm * entity_norm)
+                                if sims.shape != prior_probs.shape:
+                                    raise ValueError(Errors.E161)
+                                scores = (
+                                    prior_probs + sims - (prior_probs * sims)
+                                )
+                            # TODO: thresholding
+                            best_index = scores.argmax().item()
+                            best_candidate = candidates[best_index]
+                            final_kb_ids.append(best_candidate.entity_)
         if not (len(final_kb_ids) == entity_count):
             err = Errors.E147.format(
                 method="predict", msg="result variables not of equal length"
diff --git a/spacy/tests/regression/test_issue7065.py b/spacy/tests/regression/test_issue7065.py
index 897687d19..63d36552a 100644
--- a/spacy/tests/regression/test_issue7065.py
+++ b/spacy/tests/regression/test_issue7065.py
@@ -1,4 +1,6 @@
+from spacy.kb import KnowledgeBase
 from spacy.lang.en import English
+from spacy.training import Example
 
 
 def test_issue7065():
@@ -16,3 +18,58 @@ def test_issue7065():
     ent = doc.ents[0]
     assert ent.start < sent0.end < ent.end
     assert sentences.index(ent.sent) == 0
+
+
+def test_issue7065_b():
+    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
+    nlp = English()
+    vector_length = 3
+    nlp.add_pipe("sentencizer")
+
+    text = "Mahler 's Symphony No. 8 was beautiful."
+    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
+    links = {(0, 6): {"Q7304": 1.0, "Q270853": 0.0},
+             (10, 24): {"Q7304": 0.0, "Q270853": 1.0}}
+    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
+    doc = nlp(text)
+    example = Example.from_dict(doc, {"entities": entities, "links": links, "sent_starts": sent_starts})
+    train_examples = [example]
+
+    def create_kb(vocab):
+        # create artificial KB
+        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
+        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
+        mykb.add_alias(
+            alias="No. 8",
+            entities=["Q270853"],
+            probabilities=[1.0],
+        )
+        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
+        mykb.add_alias(
+            alias="Mahler",
+            entities=["Q7304"],
+            probabilities=[1.0],
+        )
+        return mykb
+
+    # Create the Entity Linker component and add it to the pipeline
+    entity_linker = nlp.add_pipe("entity_linker", last=True)
+    entity_linker.set_kb(create_kb)
+
+    # train the NEL pipe
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(2):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+
+    # Add a custom rule-based component to mimic NER
+    patterns = [
+        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
+        {"label": "WORK", "pattern": [{"LOWER": "symphony"}, {"LOWER": "no"}, {"LOWER": "."}, {"LOWER": "8"}]}
+    ]
+    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
+    ruler.add_patterns(patterns)
+
+    # test the trained model - this should not throw E148
+    doc = nlp(text)
+    assert doc

From 8d7af5b2b155c36ced0082a227839887605da0e6 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 12 Apr 2021 14:35:57 +0200
Subject: [PATCH 091/146] Ensure hyphen in config file works as string value
 (#7642)

* add test for serializing '-' in a config file

* bump srsly to 2.4.1
---
 requirements.txt                              |  2 +-
 setup.cfg                                     |  2 +-
 .../tests/serialize/test_serialize_config.py  | 31 ++++++++++++++++++-
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index f86efff3f..5be39f59f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.1,<1.1.0
-srsly>=2.4.0,<3.0.0
+srsly>=2.4.1,<3.0.0
 catalogue>=2.0.1,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
diff --git a/setup.cfg b/setup.cfg
index 92e758aec..f8672034f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -44,7 +44,7 @@ install_requires =
     thinc>=8.0.2,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
-    srsly>=2.4.0,<3.0.0
+    srsly>=2.4.1,<3.0.0
     catalogue>=2.0.1,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py
index 66b66b744..2cd0e4ab6 100644
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -4,7 +4,7 @@ import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
-from spacy.util import registry, load_model_from_config, load_config
+from spacy.util import registry, load_model_from_config, load_config, load_config_from_str
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
 from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
@@ -465,3 +465,32 @@ def test_config_only_resolve_relevant_blocks():
         nlp.initialize()
     nlp.config["initialize"]["lookups"] = None
     nlp.initialize()
+
+
+def test_hyphen_in_config():
+    hyphen_config_str = """
+    [nlp]
+    lang = "en"
+    pipeline = ["my_punctual_component"]
+
+    [components]
+
+    [components.my_punctual_component]
+    factory = "my_punctual_component"
+    punctuation = ["?","-"]
+    """
+
+    @spacy.Language.factory("my_punctual_component")
+    class MyPunctualComponent(object):
+        name = "my_punctual_component"
+
+        def __init__(
+            self,
+            nlp,
+            name,
+            punctuation,
+        ):
+            self.punctuation = punctuation
+
+    nlp = English.from_config(load_config_from_str(hyphen_config_str))
+    assert nlp.get_pipe("my_punctual_component").punctuation == ['?', '-']

From ed561cf428494c2b7a6790cd4b91b5326102b59d Mon Sep 17 00:00:00 2001
From: Bram Vanroy <Bram.Vanroy@UGent.be>
Date: Mon, 12 Apr 2021 14:37:00 +0200
Subject: [PATCH 092/146] Terminology: deprecated vs obsolete (#7621)

* Terminology: deprecated vs obsolete

Typically, deprecated is used for functionality that is bound to become unavailable but that can still be used. Obsolete is used for features that have been removed. In E941, I think what is meant is "obsolete" since loading a model by a shortcut simply does not work anymore (and throws an error). This is different from downloading a model with a shortcut, which is deprecated but still works.

In light of this, perhaps all other error codes should be checked as well.

* clarify that the link command is removed and not just deprecated

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
---
 spacy/cli/__init__.py    |  4 ++--
 spacy/errors.py          |  2 +-
 website/docs/usage/v3.md | 27 ++++++++++++++-------------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 7368bcef3..56c0e0f46 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -29,9 +29,9 @@ from .project.document import project_document  # noqa: F401
 
 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
 def link(*args, **kwargs):
-    """As of spaCy v3.0, symlinks like "en" are deprecated. You can load trained
+    """As of spaCy v3.0, symlinks like "en" are not supported anymore. You can load trained
     pipeline packages using their full names or from a directory path."""
     msg.warn(
-        "As of spaCy v3.0, model symlinks are deprecated. You can load trained "
+        "As of spaCy v3.0, model symlinks are not supported anymore. You can load trained "
         "pipeline packages using their full names or from a directory path."
     )
diff --git a/spacy/errors.py b/spacy/errors.py
index 89b09c09a..e4e331d42 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -636,7 +636,7 @@ class Errors:
             "method, make sure it's overwritten on the subclass.")
     E940 = ("Found NaN values in scores.")
     E941 = ("Can't find model '{name}'. It looks like you're trying to load a "
-            "model from a shortcut, which is deprecated as of spaCy v3.0. To "
+            "model from a shortcut, which is obsolete as of spaCy v3.0. To "
             "load the model, use its full name instead:\n\n"
             "nlp = spacy.load(\"{full}\")\n\nFor more details on the available "
             "models, see the models directory: https://spacy.io/models. If you "
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 847d4a327..8b4d2de7c 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -616,11 +616,11 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
 | `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          |
 | `spacy link`, `util.set_data_path`, `util.get_data_path`                                     | not needed, symlinks are deprecated                                                                                                                                                                                      |
 
-The following deprecated methods, attributes and arguments were removed in v3.0.
-Most of them have been **deprecated for a while** and many would previously
-raise errors. Many of them were also mostly internals. If you've been working
-with more recent versions of spaCy v2.x, it's **unlikely** that your code relied
-on them.
+The following methods, attributes and arguments were removed in v3.0. Most of
+them have been **deprecated for a while** and many would previously raise
+errors. Many of them were also mostly internals. If you've been working with
+more recent versions of spaCy v2.x, it's **unlikely** that your code relied on
+them.
 
 | Removed                                                                                                                 | Replacement                                                                                                                                                |
 | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -637,10 +637,10 @@ on them.
 
 ### Downloading and loading trained pipelines {#migrating-downloading-models}
 
-Symlinks and shortcuts like `en` are now officially deprecated. There are
-[many different trained pipelines](/models) with different capabilities and not
-just one "English model". In order to download and load a package, you should
-always use its full name – for instance,
+Symlinks and shortcuts like `en` have been deprecated for a while, and are now
+not supported anymore. There are [many different trained pipelines](/models)
+with different capabilities and not just one "English model". In order to
+download and load a package, you should always use its full name – for instance,
 [`en_core_web_sm`](/models/en#en_core_web_sm).
 
 ```diff
@@ -1185,9 +1185,10 @@ package isn't imported.
 In Jupyter notebooks, run [`prefer_gpu`](/api/top-level#spacy.prefer_gpu),
 [`require_gpu`](/api/top-level#spacy.require_gpu) or
 [`require_cpu`](/api/top-level#spacy.require_cpu) in the same cell as
-[`spacy.load`](/api/top-level#spacy.load) to ensure that the model is loaded on the correct device.
+[`spacy.load`](/api/top-level#spacy.load) to ensure that the model is loaded on
+the correct device.
 
-Due to a bug related to `contextvars` (see the [bug
-report](https://github.com/ipython/ipython/issues/11565)), the GPU settings may
-not be preserved correctly across cells, resulting in models being loaded on
+Due to a bug related to `contextvars` (see the
+[bug report](https://github.com/ipython/ipython/issues/11565)), the GPU settings
+may not be preserved correctly across cells, resulting in models being loaded on
 the wrong device or only partially on GPU.

From 05bdbe28bbe581bb9f7a3d236c2447d46b0b254e Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 19 Apr 2021 10:30:03 +0200
Subject: [PATCH 093/146] Fix vectors data on GPU (#7626)

* ensure vectors data is stored on right device

* ensure the added vector is on the right device

* move vector to numpy before iterating

* move best_rows to numpy before iterating
---
 spacy/vectors.pyx | 17 ++++++++++++-----
 spacy/vocab.pyx   | 12 +++++++-----
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index bcea87e67..7cb3322c2 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -55,7 +55,7 @@ cdef class Vectors:
         """Create a new vector store.
 
         shape (tuple): Size of the table, as (# entries, # columns)
-        data (numpy.ndarray): The vector data.
+        data (numpy.ndarray or cupy.ndarray): The vector data.
         keys (iterable): A sequence of keys, aligned with the data.
         name (str): A name to identify the vectors table.
 
@@ -65,7 +65,8 @@ cdef class Vectors:
         if data is None:
             if shape is None:
                 shape = (0,0)
-            data = numpy.zeros(shape, dtype="f")
+            ops = get_current_ops()
+            data = ops.xp.zeros(shape, dtype="f")
         self.data = data
         self.key2row = {}
         if self.data is not None:
@@ -300,6 +301,8 @@ cdef class Vectors:
         else:
             raise ValueError(Errors.E197.format(row=row, key=key))
         if vector is not None:
+            xp = get_array_module(self.data)
+            vector = xp.asarray(vector)
             self.data[row] = vector
         if self._unset.count(row):
             self._unset.erase(self._unset.find(row))
@@ -321,10 +324,11 @@ cdef class Vectors:
         RETURNS (tuple): The most similar entries as a `(keys, best_rows, scores)`
             tuple.
         """
+        xp = get_array_module(self.data)
         filled = sorted(list({row for row in self.key2row.values()}))
         if len(filled) < n:
             raise ValueError(Errors.E198.format(n=n, n_rows=len(filled)))
-        xp = get_array_module(self.data)
+        filled = xp.asarray(filled)
 
         norms = xp.linalg.norm(self.data[filled], axis=1, keepdims=True)
         norms[norms == 0] = 1
@@ -357,8 +361,10 @@ cdef class Vectors:
         # Account for numerical error we want to return in range -1, 1
         scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
         row2key = {row: key for key, row in self.key2row.items()}
+
+        numpy_rows = get_current_ops().to_numpy(best_rows)
         keys = xp.asarray(
-            [[row2key[row] for row in best_rows[i] if row in row2key]
+            [[row2key[row] for row in numpy_rows[i] if row in row2key]
                     for i in range(len(queries)) ], dtype="uint64")
         return (keys, best_rows, scores)
 
@@ -459,7 +465,8 @@ cdef class Vectors:
             if hasattr(self.data, "from_bytes"):
                 self.data.from_bytes()
             else:
-                self.data = srsly.msgpack_loads(b)
+                xp = get_array_module(self.data)
+                self.data = xp.asarray(srsly.msgpack_loads(b))
 
         deserializers = {
             "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)),
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 1008797b3..ee440898a 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -2,7 +2,7 @@
 from libc.string cimport memcpy
 
 import srsly
-from thinc.api import get_array_module
+from thinc.api import get_array_module, get_current_ops
 import functools
 
 from .lexeme cimport EMPTY_LEXEME, OOV_RANK
@@ -293,7 +293,7 @@ cdef class Vocab:
         among those remaining.
 
         For example, suppose the original table had vectors for the words:
-        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to,
+        ['sat', 'cat', 'feline', 'reclined']. If we prune the vector table to
         two rows, we would discard the vectors for 'feline' and 'reclined'.
         These words would then be remapped to the closest remaining vector
         -- so "feline" would have the same vector as "cat", and "reclined"
@@ -314,6 +314,7 @@ cdef class Vocab:
 
         DOCS: https://spacy.io/api/vocab#prune_vectors
         """
+        ops = get_current_ops()
         xp = get_array_module(self.vectors.data)
         # Make sure all vectors are in the vocab
         for orth in self.vectors:
@@ -329,8 +330,9 @@ cdef class Vocab:
         toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
         self.vectors = Vectors(data=keep, keys=keys[:nr_row], name=self.vectors.name)
         syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
+        syn_keys = ops.to_numpy(syn_keys)
         remap = {}
-        for i, key in enumerate(keys[nr_row:]):
+        for i, key in enumerate(ops.to_numpy(keys[nr_row:])):
             self.vectors.add(key, row=syn_rows[i][0])
             word = self.strings[key]
             synonym = self.strings[syn_keys[i][0]]
@@ -351,7 +353,7 @@ cdef class Vocab:
             Defaults to the length of `orth`.
         maxn (int): Maximum n-gram length used for Fasttext's ngram computation.
             Defaults to the length of `orth`.
-        RETURNS (numpy.ndarray): A word vector. Size
+        RETURNS (numpy.ndarray or cupy.ndarray): A word vector. Size
             and shape determined by the `vocab.vectors` instance. Usually, a
             numpy ndarray of shape (300,) and dtype float32.
 
@@ -400,7 +402,7 @@ cdef class Vocab:
         by string or int ID.
 
         orth (int / unicode): The word.
-        vector (numpy.ndarray[ndim=1, dtype='float32']): The vector to set.
+        vector (numpy.ndarray or cupy.ndarray[ndim=1, dtype='float32']): The vector to set.
 
         DOCS: https://spacy.io/api/vocab#set_vector
         """

From 1ad646cbcf0015cb3b944f98bef1b3a9eeb54e9f Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 19 Apr 2021 10:36:32 +0200
Subject: [PATCH 094/146] Improve checks for sourced components (#7490)

* Improve checks for sourced components

* Remove language class checks

* Convert python warning to logger warning

* Remove unused warning

* Fix formatting
---
 spacy/errors.py                             |  6 ++--
 spacy/language.py                           |  9 ++++--
 spacy/tests/pipeline/test_pipe_factories.py | 34 +++++++++++++++++++++
 3 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index e4e331d42..453e98b59 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -159,6 +159,8 @@ class Warnings:
             "http://spacy.io/usage/v3#jupyter-notebook-gpu")
     W112 = ("The model specified to use for initial vectors ({name}) has no "
             "vectors. This is almost certainly a mistake.")
+    W113 = ("Sourced component '{name}' may not work as expected: source "
+            "vectors are not identical to current pipeline vectors.")
 
 
 @add_codes
@@ -651,8 +653,8 @@ class Errors:
             "returned the initialized nlp object instead?")
     E944 = ("Can't copy pipeline component '{name}' from source '{model}': "
             "not found in pipeline. Available components: {opts}")
-    E945 = ("Can't copy pipeline component '{name}' from source. Expected loaded "
-            "nlp object, but got: {source}")
+    E945 = ("Can't copy pipeline component '{name}' from source. Expected "
+            "loaded nlp object, but got: {source}")
     E947 = ("`Matcher.add` received invalid `greedy` argument: expected "
             "a string value from {expected} but got: '{arg}'")
     E948 = ("`Matcher.add` received invalid 'patterns' argument: expected "
diff --git a/spacy/language.py b/spacy/language.py
index 68bd3cd4c..6f6470533 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -682,9 +682,14 @@ class Language:
         name (str): Optional alternative name to use in current pipeline.
         RETURNS (Tuple[Callable, str]): The component and its factory name.
         """
-        # TODO: handle errors and mismatches (vectors etc.)
-        if not isinstance(source, self.__class__):
+        # Check source type
+        if not isinstance(source, Language):
             raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
+        # Check vectors, with faster checks first
+        if self.vocab.vectors.shape != source.vocab.vectors.shape or \
+                self.vocab.vectors.key2row != source.vocab.vectors.key2row or \
+                self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes():
+            util.logger.warning(Warnings.W113.format(name=source_name))
         if not source_name in source.component_names:
             raise KeyError(
                 Errors.E944.format(
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index e1706ffb1..a7071abfd 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -1,4 +1,6 @@
 import pytest
+import mock
+import logging
 from spacy.language import Language
 from spacy.lang.en import English
 from spacy.lang.de import German
@@ -402,6 +404,38 @@ def test_pipe_factories_from_source():
         nlp.add_pipe("custom", source=source_nlp)
 
 
+def test_pipe_factories_from_source_language_subclass():
+    class CustomEnglishDefaults(English.Defaults):
+        stop_words = set(["custom", "stop"])
+
+    @registry.languages("custom_en")
+    class CustomEnglish(English):
+        lang = "custom_en"
+        Defaults = CustomEnglishDefaults
+
+    source_nlp = English()
+    source_nlp.add_pipe("tagger")
+
+    # custom subclass
+    nlp = CustomEnglish()
+    nlp.add_pipe("tagger", source=source_nlp)
+    assert "tagger" in nlp.pipe_names
+
+    # non-subclass
+    nlp = German()
+    nlp.add_pipe("tagger", source=source_nlp)
+    assert "tagger" in nlp.pipe_names
+
+    # mismatched vectors
+    nlp = English()
+    nlp.vocab.vectors.resize((1, 4))
+    nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4])
+    logger = logging.getLogger("spacy")
+    with mock.patch.object(logger, "warning") as mock_warning:
+        nlp.add_pipe("tagger", source=source_nlp)
+        mock_warning.assert_called()
+
+
 def test_pipe_factories_from_source_custom():
     """Test adding components from a source model with custom components."""
     name = "test_pipe_factories_from_source_custom"

From 15bd230413ba2da67e0c06e03a8337dc209f5a83 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 19 Apr 2021 10:37:17 +0200
Subject: [PATCH 095/146] Set catalogue lower pin to v2.0.3 (#7762)

* Set catalogue lower pin to v2.0.2

* Update importlib-metadata pins to match

* Require catalogue v2.0.3

Switch to vendored `importlib-metadata` v3.2.0 provided by `catalogue`.
---
 requirements.txt | 3 +--
 setup.cfg        | 3 +--
 spacy/util.py    | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5be39f59f..86ffa9945 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.1,<1.1.0
 srsly>=2.4.1,<3.0.0
-catalogue>=2.0.1,<2.1.0
+catalogue>=2.0.3,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
 # Third party dependencies
@@ -20,7 +20,6 @@ jinja2
 # Official Python utilities
 setuptools
 packaging>=20.0
-importlib_metadata>=0.20; python_version < "3.8"
 typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8"
 # Development dependencies
 cython>=0.25
diff --git a/setup.cfg b/setup.cfg
index f8672034f..6c65277c6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,7 +45,7 @@ install_requires =
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
-    catalogue>=2.0.1,<2.1.0
+    catalogue>=2.0.3,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
     # Third-party dependencies
@@ -57,7 +57,6 @@ install_requires =
     # Official Python utilities
     setuptools
     packaging>=20.0
-    importlib_metadata>=0.20; python_version < "3.8"
     typing_extensions>=3.7.4,<4.0.0.0; python_version < "3.8"
 
 [options.entry_points]
diff --git a/spacy/util.py b/spacy/util.py
index 9915de935..d8854f68d 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -36,7 +36,7 @@ except ImportError:
 try:  # Python 3.8
     import importlib.metadata as importlib_metadata
 except ImportError:
-    import importlib_metadata
+    from catalogue import _importlib_metadata as importlib_metadata
 
 # These are functions that were previously (v2.x) available from spacy.util
 # and have since moved to Thinc. We're importing them here so people's code

From c786e98e56eaba201812758c3ff023395ed650eb Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 19 Apr 2021 10:39:11 +0200
Subject: [PATCH 096/146] assemble CLI command (#7783)

* assemble CLI command

* ensure assemble runs even without training section

* cleanup
---
 spacy/cli/__init__.py   |  1 +
 spacy/cli/assemble.py   | 58 +++++++++++++++++++++++++++++++++++++++++
 website/docs/api/cli.md | 29 +++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100644 spacy/cli/assemble.py

diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 56c0e0f46..fd8da262e 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -9,6 +9,7 @@ from .info import info  # noqa: F401
 from .package import package  # noqa: F401
 from .profile import profile  # noqa: F401
 from .train import train_cli  # noqa: F401
+from .assemble import assemble_cli  # noqa: F401
 from .pretrain import pretrain  # noqa: F401
 from .debug_data import debug_data  # noqa: F401
 from .debug_config import debug_config  # noqa: F401
diff --git a/spacy/cli/assemble.py b/spacy/cli/assemble.py
new file mode 100644
index 000000000..f63c51857
--- /dev/null
+++ b/spacy/cli/assemble.py
@@ -0,0 +1,58 @@
+from typing import Optional
+from pathlib import Path
+from wasabi import msg
+import typer
+import logging
+
+from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
+from ._util import import_code
+from ..training.initialize import init_nlp
+from .. import util
+from ..util import get_sourced_components, load_model_from_config
+
+
+@app.command(
+    "assemble",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def assemble_cli(
+    # fmt: off
+    ctx: typer.Context,  # This is only used to read additional arguments
+    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
+    output_path: Path = Arg(..., help="Output directory to store assembled pipeline in"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
+    verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
+    # fmt: on
+):
+    """
+    Assemble a spaCy pipeline from a config file. The config file includes
+    all settings for initializing the pipeline. To override settings in the
+    config, e.g. settings that point to local paths or that you want to
+    experiment with, you can override them as command line options. The
+    --code argument lets you pass in a Python file that can be used to
+    register custom functions that are referenced in the config.
+
+    DOCS: https://spacy.io/api/cli#assemble
+    """
+    util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
+    # Make sure all files and paths exist if they are needed
+    if not config_path or (str(config_path) != "-" and not config_path.exists()):
+        msg.fail("Config file not found", config_path, exits=1)
+    overrides = parse_config_overrides(ctx.args)
+    import_code(code_path)
+    with show_validation_error(config_path):
+        config = util.load_config(config_path, overrides=overrides, interpolate=False)
+    msg.divider("Initializing pipeline")
+    nlp = load_model_from_config(config, auto_fill=True)
+    config = config.interpolate()
+    sourced = get_sourced_components(config)
+    # Make sure that listeners are defined before initializing further
+    nlp._link_components()
+    with nlp.select_pipes(disable=[*sourced]):
+        nlp.initialize()
+    msg.good("Initialized pipeline")
+    msg.divider("Serializing to disk")
+    if output_path is not None and not output_path.exists():
+        output_path.mkdir(parents=True)
+        msg.good(f"Created output directory: {output_path}")
+    nlp.to_disk(output_path)
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 73a03cba8..196e47543 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -12,6 +12,7 @@ menu:
   - ['train', 'train']
   - ['pretrain', 'pretrain']
   - ['evaluate', 'evaluate']
+  - ['assemble', 'assemble']
   - ['package', 'package']
   - ['project', 'project']
   - ['ray', 'ray']
@@ -892,6 +893,34 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
 | `--help`, `-h`                            | Show help message and available arguments. ~~bool (flag)~~                                                                                                                           |
 | **CREATES**                               | Training results and optional metrics and visualizations.                                                                                                                            |
 
+## assemble {#assemble tag="command"}
+
+Assemble a pipeline from a config file without additional training. Expects a
+[config file](/api/data-formats#config) with all settings and hyperparameters.
+The `--code` argument can be used to import a Python file that lets you register
+[custom functions](/usage/training#custom-functions) and refer to them in your
+config.
+
+> #### Example
+>
+> ```cli
+> $ python -m spacy assemble config.cfg ./output
+> ```
+
+```cli
+$ python -m spacy assemble [config_path] [output_dir] [--code] [--verbose] [overrides]
+```
+
+| Name              | Description                                                                                                                                                                                                   |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `config_path`     | Path to the [config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
+| `output_dir`      | Directory to store the final pipeline in. Will be created if it doesn't exist. ~~Path \(positional)~~                                                                                                         |
+| `--code`, `-c`    | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions). ~~Optional[Path] \(option)~~                                                |
+| `--verbose`, `-V` | Show more detailed messages during processing. ~~bool (flag)~~                                                                                                                                                |
+| `--help`, `-h`    | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                    |
+| overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.data ./data`. ~~Any (option/flag)~~                            |
+| **CREATES**       | The final assembled pipeline.                                                                                                                                                                                 |
+
 ## package {#package tag="command"}
 
 Generate an installable [Python package](/usage/training#models-generating) from

From 07b41c38aedf49eed9615caea4e54cd621093f0b Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 19 Apr 2021 10:39:34 +0200
Subject: [PATCH 097/146] Register CharEmbed layer (#7805)

---
 spacy/ml/_character_embed.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/ml/_character_embed.py b/spacy/ml/_character_embed.py
index f5c539c42..0ed28b859 100644
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@@ -3,8 +3,10 @@ from thinc.api import Model
 from thinc.types import Floats2d
 
 from ..tokens import Doc
+from ..util import registry
 
 
+@registry.layers("spacy.CharEmbed.v1")
 def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     # nM: Number of dimensions per character. nC: Number of characters.
     return Model(

From 0e7f94b247e0e616439339c66588871a4be30750 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 19 Apr 2021 11:08:20 +0200
Subject: [PATCH 098/146] Update Tokenizer.explain with special matches (#7749)

* Update Tokenizer.explain with special matches

Update `Tokenizer.explain` and the pseudo-code in the docs to include
the processing of special cases that contain affixes or whitespace.

* Handle optional settings in explain

* Add test for special matches in explain

Add test for `Tokenizer.explain` for special cases containing affixes.
---
 spacy/tests/tokenizer/test_explain.py     | 17 ++++++++++
 spacy/tokenizer.pyx                       | 39 +++++++++++++++++++++--
 website/docs/usage/linguistic-features.md |  7 +++-
 3 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
index ea6cf91be..0a10ae67d 100644
--- a/spacy/tests/tokenizer/test_explain.py
+++ b/spacy/tests/tokenizer/test_explain.py
@@ -1,5 +1,7 @@
 import pytest
+import re
 from spacy.util import get_lang_class
+from spacy.tokenizer import Tokenizer
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -60,3 +62,18 @@ def test_tokenizer_explain(lang):
         tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
         debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
         assert tokens == debug_tokens
+
+
+def test_tokenizer_explain_special_matcher(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    infix_re = re.compile(r"[/]")
+    rules = {"a.": [{"ORTH": "a."}]}
+    tokenizer = Tokenizer(
+        en_vocab,
+        rules=rules,
+        suffix_search=suffix_re.search,
+        infix_finditer=infix_re.finditer,
+    )
+    tokens = [t.text for t in tokenizer("a/a.")]
+    explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
+    assert tokens == explain_tokens
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 5bd6e7aa3..41bbaeee6 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -20,11 +20,12 @@ from .attrs import intify_attrs
 from .symbols import ORTH, NORM
 from .errors import Errors, Warnings
 from . import util
-from .util import registry
+from .util import registry, get_words_and_spaces
 from .attrs import intify_attrs
 from .symbols import ORTH
 from .scorer import Scorer
 from .training import validate_examples
+from .tokens import Span
 
 
 cdef class Tokenizer:
@@ -638,8 +639,14 @@ cdef class Tokenizer:
         DOCS: https://spacy.io/api/tokenizer#explain
         """
         prefix_search = self.prefix_search
+        if prefix_search is None:
+            prefix_search = re.compile("a^").search
         suffix_search = self.suffix_search
+        if suffix_search is None:
+            suffix_search = re.compile("a^").search
         infix_finditer = self.infix_finditer
+        if infix_finditer is None:
+            infix_finditer = re.compile("a^").finditer
         token_match = self.token_match
         if token_match is None:
             token_match = re.compile("a^").match
@@ -687,7 +694,7 @@ cdef class Tokenizer:
                     tokens.append(("URL_MATCH", substring))
                     substring = ''
                 elif substring in special_cases:
-                    tokens.extend(("SPECIAL-" + str(i + 1), self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
+                    tokens.extend((f"SPECIAL-{i + 1}", self.vocab.strings[e[ORTH]]) for i, e in enumerate(special_cases[substring]))
                     substring = ''
                 elif list(infix_finditer(substring)):
                     infixes = infix_finditer(substring)
@@ -705,7 +712,33 @@ cdef class Tokenizer:
                     tokens.append(("TOKEN", substring))
                     substring = ''
             tokens.extend(reversed(suffixes))
-        return tokens
+        # Find matches for special cases handled by special matcher
+        words, spaces = get_words_and_spaces([t[1] for t in tokens], text)
+        t_words = []
+        t_spaces = []
+        for word, space in zip(words, spaces):
+            if not word.isspace():
+                t_words.append(word)
+                t_spaces.append(space)
+        doc = Doc(self.vocab, words=t_words, spaces=t_spaces)
+        matches = self._special_matcher(doc)
+        spans = [Span(doc, s, e, label=m_id) for m_id, s, e in matches]
+        spans = util.filter_spans(spans)
+        # Replace matched tokens with their exceptions
+        i = 0
+        final_tokens = []
+        spans_by_start = {s.start: s for s in spans}
+        while i < len(tokens):
+            if i in spans_by_start:
+                span = spans_by_start[i]
+                exc = [d[ORTH] for d in special_cases[span.label_]]
+                for j, orth in enumerate(exc):
+                    final_tokens.append((f"SPECIAL-{j + 1}", self.vocab.strings[orth]))
+                i += len(span)
+            else:
+                final_tokens.append(tokens[i])
+                i += 1
+        return final_tokens
 
     def score(self, examples, **kwargs):
         validate_examples(examples, "Tokenizer.score")
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 2d3390049..077b1a556 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -786,6 +786,7 @@ rather than performance:
 
 ```python
 def tokenizer_pseudo_code(
+    text,
     special_cases,
     prefix_search,
     suffix_search,
@@ -839,12 +840,14 @@ def tokenizer_pseudo_code(
                 tokens.append(substring)
                 substring = ""
         tokens.extend(reversed(suffixes))
+    for match in matcher(special_cases, text):
+        tokens.replace(match, special_cases[match])
     return tokens
 ```
 
 The algorithm can be summarized as follows:
 
-1. Iterate over whitespace-separated substrings.
+1. Iterate over space-separated substrings.
 2. Look for a token match. If there is a match, stop processing and keep this
    token.
 3. Check whether we have an explicitly defined special case for this substring.
@@ -858,6 +861,8 @@ The algorithm can be summarized as follows:
 8. Look for "infixes" – stuff like hyphens etc. and split the substring into
    tokens on all infixes.
 9. Once we can't consume any more of the string, handle it as a single token.
+10. Make a final pass over the text to check for special cases that include
+    spaces or that were missed due to the incremental processing of affixes.
 
 </Accordion>
 

From df541c6b5e435ed5a1e125c720449535f8c7da86 Mon Sep 17 00:00:00 2001
From: langdonholmes <55119338+langdonholmes@users.noreply.github.com>
Date: Mon, 19 Apr 2021 02:58:12 -0700
Subject: [PATCH 099/146] Update processing-pipelines.md to mention method for
 doc metadata (#7480)

* Update processing-pipelines.md

Under "things to try," inform users they can save metadata when using nlp.pipe(foobar, as_tuples=True)

Link to a new example on the attributes page detailing the following:

> ```
> data = [
>   ("Some text to process", {"meta": "foo"}),
>   ("And more text...", {"meta": "bar"})
> ]
>
> for doc, context in nlp.pipe(data, as_tuples=True):
>     # Let's assume you have a "meta" extension registered on the Doc
>     doc._.meta = context["meta"]
> ```

from https://stackoverflow.com/questions/57058798/make-spacy-nlp-pipe-process-tuples-of-text-and-additional-information-to-add-as

* Updating the attributes section

Update the attributes section with example of how extensions can be used to store metadata.

* Update processing-pipelines.md

* Update processing-pipelines.md

Made as_tuples example executable and relocated to the end of the "Processing Text" section.

* Update processing-pipelines.md

* Update processing-pipelines.md

Removed extra line

* Reformat and rephrase

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/docs/usage/processing-pipelines.md | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 52568658d..bde3ab84f 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -91,6 +91,37 @@ have to call `list()` on it first:
 
 </Infobox>
 
+You can use the `as_tuples` option to pass additional context along with each
+doc when using [`nlp.pipe`](/api/language#pipe). If `as_tuples` is `True`, then
+the input should be a sequence of `(text, context)` tuples and the output will
+be a sequence of `(doc, context)` tuples. For example, you can pass metadata in
+the context and save it in a [custom attribute](#custom-components-attributes):
+
+```python
+### {executable="true"}
+import spacy
+from spacy.tokens import Doc
+
+if not Doc.has_extension("text_id"):
+    Doc.set_extension("text_id", default=None)
+
+text_tuples = [
+    ("This is the first text.", {"text_id": "text1"}),
+    ("This is the second text.", {"text_id": "text2"})
+]
+
+nlp = spacy.load("en_core_web_sm")
+doc_tuples = nlp.pipe(text_tuples, as_tuples=True)
+
+docs = []
+for doc, context in doc_tuples:
+    doc._.text_id = context["text_id"]
+    docs.append(doc)
+
+for doc in docs:
+    print(f"{doc._.text_id}: {doc.text}")
+```
+
 ### Multiprocessing {#multiprocessing}
 
 spaCy includes built-in support for multiprocessing with
@@ -1373,6 +1404,8 @@ There are three main types of extensions, which can be defined using the
 [`Span.set_extension`](/api/span#set_extension) and
 [`Token.set_extension`](/api/token#set_extension) methods.
 
+## Description
+
 1. **Attribute extensions.** Set a default value for an attribute, which can be
    overwritten manually at any time. Attribute extensions work like "normal"
    variables and are the quickest way to store arbitrary information on a `Doc`,

From 2722424ec57247dbea43c2f8de40f5e2286125d2 Mon Sep 17 00:00:00 2001
From: hudsonr <richard.hudson@msg.group>
Date: Mon, 19 Apr 2021 14:28:06 +0200
Subject: [PATCH 100/146] Added universe entry for Coreferee

---
 website/meta/universe.json | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index dcc9ce3d4..01aa058b5 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2223,6 +2223,39 @@
                 "github": "richardpaulhudson"
             }
         },
+        {
+            "id": "coreferee",
+            "title": "Coreferee",
+            "slogan": "Coreference resolution for multiple languages",
+            "github": "msg-systems/coreferee",
+            "url": "https://github.com/msg-systems/coreferee",
+            "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. Please note you will need to [install models](https://github.com/msg-systems/coreferee#getting-started) before running the code example.",
+            "pip": "coreferee",
+            "category": ["pipeline", "models", "standalone"],
+            "tags": ["coreference-resolution", "anaphora"],
+            "code_example": [
+                "import coreferee, spacy",
+                "nlp = spacy.load('en_core_web_trf')",
+                "nlp.add_pipe('coreferee')",
+                "doc = nlp('Although he was very busy with his work, Peter had had enough of it. He and his wife decided they needed a holiday. They travelled to Spain because they loved the country very much.')",
+                "doc._.coref_chains.print()",
+                "# Output:",
+                "#",
+                "# 0: he(1), his(6), Peter(9), He(16), his(18)",
+                "# 1: work(7), it(14)",
+                "# 2: [He(16); wife(19)], they(21), They(26), they(31)",
+                "# 3: Spain(29), country(34)",
+                "#",
+                "print(doc._.coref_chains.resolve(doc[31]))",
+                "# Output:",
+                "#",
+                "# [Peter, wife]"
+            ],
+            "author": "Richard Paul Hudson",
+            "author_links": {
+                "github": "richardpaulhudson"
+            }
+        },
         {
             "id": "spacy-transformers",
             "title": "spacy-transformers",

From 6017fcf69311c78f8ebc0bd1c6e3a2d2873d42dc Mon Sep 17 00:00:00 2001
From: Shantam Raj <shantamdps@gmail.com>
Date: Wed, 21 Apr 2021 12:46:32 +0530
Subject: [PATCH 101/146] Default code for Setting Entity annotations on the
 website errors (#7738)

* the default example for "Setting entity annotations" errors on Binder

* updating contributor info

* using a new variable to store original entities
---
 .github/contributors/armsp.md             | 6 +++---
 website/docs/usage/linguistic-features.md | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/contributors/armsp.md b/.github/contributors/armsp.md
index 63d1367e4..45607d69c 100644
--- a/.github/contributors/armsp.md
+++ b/.github/contributors/armsp.md
@@ -98,9 +98,9 @@ mark both statements:
 
 | Field                          | Entry                |
 |------------------------------- | -------------------- |
-| Name                           |  Shantam             |
+| Name                           |  Shantam Raj         |
 | Company name (if applicable)   |                      |
 | Title or role (if applicable)  |                      |
-| Date                           |   21/5/2018          |
+| Date                           |   10/4/2021          |
 | GitHub username                |     armsp            |
-| Website (optional)             |                      |
+| Website (optional)             |https://shantamraj.com|
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 077b1a556..5a1293c2e 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -601,12 +601,13 @@ print('Before', ents)
 
 # Create a span for the new entity
 fb_ent = Span(doc, 0, 1, label="ORG")
+orig_ents = list(doc.ents)
 
 # Option 1: Modify the provided entity spans, leaving the rest unmodified
 doc.set_ents([fb_ent], default="unmodified")
 
 # Option 2: Assign a complete list of ents to doc.ents
-doc.ents = list(doc.ents) + [fb_ent]
+doc.ents = orig_ents + [fb_ent]
 
 ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
 print('After', ents)

From debfb46088ee7eb53a02f80f0d5fa0ea7a8ee5bf Mon Sep 17 00:00:00 2001
From: Pierre Lison <plison@nr.no>
Date: Thu, 22 Apr 2021 00:58:09 +0200
Subject: [PATCH 102/146] adding skweak to the SpaCy universe

---
 website/meta/universe.json | 55 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 01aa058b5..7b13e9ac2 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,58 @@
 {
     "resources": [
+        {
+            "id": "skweak",
+            "title": "skweak",
+            "slogan": "Weak supervision for NLP",
+            "description": "`skweak` brings the power of weak supervision to NLP tasks, and in particular sequence labelling and text classification. Instead of annotating documents by hand, `skweak` allows you to define *labelling functions* to automatically label your documents, and then aggregate their results using a statistical model that estimates the accuracy and confusions of each labelling function.",
+            "github": "NorskRegnesentral/skweak",
+            "pip": "skweak",
+            "code_example": [
+                "import spacy, re",
+                "from skweak import heuristics, gazetteers, aggregation, utils",
+                "",
+                "# LF 1: heuristic to detect occurrences of MONEY entities",
+                "def money_detector(doc):",
+                "   for tok in doc[1:]:",
+                "      if tok.text[0].isdigit() and tok.nbor(-1).is_currency:",
+                "          yield tok.i-1, tok.i+1, 'MONEY'",
+                "lf1 = heuristics.FunctionAnnotator('money', money_detector)",
+                "",
+                "# LF 2: detection of years with a regex",
+                "lf2= heuristics.TokenConstraintAnnotator ('years', lambda tok: re.match('(19|20)\\d{2}$', tok.text), 'DATE')",
+                "",
+                "# LF 3: a gazetteer with a few names",
+                "NAMES = [('Barack', 'Obama'), ('Donald', 'Trump'), ('Joe', 'Biden')]",
+                "trie = gazetteers.Trie(NAMES)",
+                "lf3 = gazetteers.GazetteerAnnotator('presidents', {'PERSON':trie})",
+                "",
+                "# We create a corpus (here with a single text)",
+                "nlp = spacy.load('en_core_web_sm')",
+                "doc = nlp('Donald Trump paid $750 in federal income taxes in 2016')",
+                "",
+                "# apply the labelling functions",
+                "doc = lf3(lf2(lf1(doc)))",
+                "",
+                "# and aggregate them",
+                "hmm = aggregation.HMM('hmm', ['PERSON', 'DATE', 'MONEY'])",
+                "hmm.fit_and_aggregate([doc])",
+                "",
+                "# we can then visualise the final result (in Jupyter)",
+                "utils.display_entities(doc, 'hmm')"
+            ],
+            "code_language": "python",
+            "url": "https://github.com/NorskRegnesentral/skweak",
+            "thumb": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo_thumbnail.jpg",
+            "image": "https://raw.githubusercontent.com/NorskRegnesentral/skweak/main/data/skweak_logo.jpg",
+            "author": "Pierre Lison",
+            "author_links": {
+                "twitter": "plison2",
+                "github": "plison",
+                "website": "https://www.nr.no/~plison"
+            },
+            "category": ["pipeline", "standalone", "research", "training"],
+            "tags": []
+        },
 	{
 	    "id": "numerizer",
 	    "title": "numerizer",
@@ -3002,4 +3055,4 @@
             ]
         }
     ]
-}
+}
\ No newline at end of file

From 2f0ef2c9ccb6aa584f00b5213f44bf089f7ba9ff Mon Sep 17 00:00:00 2001
From: Pierre Lison <plison@nr.no>
Date: Thu, 22 Apr 2021 01:16:34 +0200
Subject: [PATCH 103/146] adding skweak to the SpaCy universe

---
 .github/contributors/plison.md | 106 +++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/contributors/plison.md

diff --git a/.github/contributors/plison.md b/.github/contributors/plison.md
new file mode 100644
index 000000000..e98b096b4
--- /dev/null
+++ b/.github/contributors/plison.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [ ] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Pierre Lison         |
+| Company name (if applicable)   | Norsk Regnesentral   |
+| Title or role (if applicable)  | Senior Researcher    |
+| Date                           | 22.04.2021           |
+| GitHub username                | plison               |
+| Website (optional)             | www.nr.no/~plison    |

From a9e5ae9b5c8877d47993bcf35105294f15809eb9 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Thu, 22 Apr 2021 10:58:05 +1000
Subject: [PATCH 104/146] Auto-format [ci skip]

---
 website/meta/universe.json | 46 +++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 7b13e9ac2..04e2a2b04 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -53,28 +53,28 @@
             "category": ["pipeline", "standalone", "research", "training"],
             "tags": []
         },
-	{
-	    "id": "numerizer",
-	    "title": "numerizer",
-	    "slogan": "Convert natural language numerics into ints and floats.",
-	    "description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.",
-	    "github": "jaidevd/numerizer",
-	    "pip": "numerizer",
-	    "code_example": [
-		"from spacy import load",
-		"import numerizer",
-		"nlp = load('en_core_web_sm') # or any other model",
-		"doc = nlp('The Hogwarts Express is at platform nine and three quarters')",
-		"doc._.numerize()",
-		"# {nine and three quarters: '9.75'}"
-	    ],
-	    "author": "Jaidev Deshpande",
-	    "author_links": {
-		"github": "jaidevd",
-		"twitter": "jaidevd"
-	    },
-	    "category": ["standalone"]
-	},
+        {
+            "id": "numerizer",
+            "title": "numerizer",
+            "slogan": "Convert natural language numerics into ints and floats.",
+            "description": "A SpaCy extension for Docs, Spans and Tokens that converts numerical words and quantitative named entities into numeric strings.",
+            "github": "jaidevd/numerizer",
+            "pip": "numerizer",
+            "code_example": [
+                "from spacy import load",
+                "import numerizer",
+                "nlp = load('en_core_web_sm') # or any other model",
+                "doc = nlp('The Hogwarts Express is at platform nine and three quarters')",
+                "doc._.numerize()",
+                "# {nine and three quarters: '9.75'}"
+            ],
+            "author": "Jaidev Deshpande",
+            "author_links": {
+                "github": "jaidevd",
+                "twitter": "jaidevd"
+            },
+            "category": ["standalone"]
+        },
         {
             "id": "spikex",
             "title": "SpikeX - SpaCy Pipes for Knowledge Extraction",
@@ -3055,4 +3055,4 @@
             ]
         }
     ]
-}
\ No newline at end of file
+}

From bbade153ed56fa2e6db8855266508e7eda5b446d Mon Sep 17 00:00:00 2001
From: Diego Palma <dpalmasan@gmail.com>
Date: Thu, 22 Apr 2021 02:40:28 -0400
Subject: [PATCH 105/146] Add TRUNAJOD to spaCy universe. (#7754)

* Add TRUNAJOD to spaCy universe.

* Add trunajod logo and thumb.

Co-authored-by: Diego <dpalma@evernote.com>
---
 website/meta/universe.json | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 04e2a2b04..983846d48 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2970,6 +2970,41 @@
             },
             "category": ["pipeline", "standalone"],
             "tags": ["Text Analytics", "Russian"]
+        },
+        {
+            "id": "trunajod",
+            "title": "TRUNAJOD",
+            "slogan": "A text complexity library for text analysis built on spaCy",
+            "description": "With all the basic NLP capabilities provided by spaCy (dependency parsing, POS tagging, tokenizing), `TRUNAJOD` focuses on extracting measurements from texts that might be interesting for different applications and use cases.",
+            "github": "dpalmasan/TRUNAJOD2.0",
+            "pip": "trunajod",
+            "code_example": [
+                "import spacy",
+                "from TRUNAJOD.entity_grid import EntityGrid",
+                "",
+                "nlp = spacy.load('es_core_news_sm', disable=['ner', 'textcat'])",
+                "example_text = (",
+                "    'El espectáculo del cielo nocturno cautiva la mirada y suscita preguntas '",
+                "    'sobre el universo, su origen y su funcionamiento. No es sorprendente que '",
+                "    'todas las civilizaciones y culturas hayan formado sus propias '",
+                "    'cosmologías. Unas relatan, por ejemplo, que el universo ha '",
+                "    'sido siempre tal como es, con ciclos que inmutablemente se repiten; '",
+                "    'otras explican que este universo ha tenido un principio, '",
+                "    'que ha aparecido por obra creadora de una divinidad.'",
+                ")",
+                "doc = nlp(example_text)",
+                "egrid = EntityGrid(doc)",
+                "print(egrid.get_egrid())"
+            ],
+            "code_language": "python",
+            "thumb": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_thumb.png",
+            "image": "https://raw.githubusercontent.com/dpalmasan/TRUNAJOD2.0/master/imgs/trunajod_logo.png",
+            "author": "Diego Palma",
+            "author_links": {
+                "github": "dpalmasan"
+            },
+            "category": ["research", "standalone", "scientific"],
+            "tags": ["Text Analytics", "Coherence", "Cohesion"]
         }
     ],
 

From b8c6c10c6faaedee61140b9e439dd3242a698820 Mon Sep 17 00:00:00 2001
From: Sam Edwardes <edwardes.s@gmail.com>
Date: Wed, 21 Apr 2021 23:41:55 -0700
Subject: [PATCH 106/146] Added a logo to spaCyTextBlob (#7818)

* Added a logo to spaCyTextBlob

* Updated to better thumb
---
 website/meta/universe.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 983846d48..50137ee99 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -138,6 +138,7 @@
             "id": "spacy-textblob",
             "title": "spaCyTextBlob",
             "slogan": "Easy sentiment analysis for spaCy using TextBlob. Now supports spaCy 3.0!",
+            "thumb": "https://github.com/SamEdwardes/spaCyTextBlob/raw/main/website/static/img/logo-thumb-square-250x250.png",
             "description": "spaCyTextBlob is a pipeline component that enables sentiment analysis using the [TextBlob](https://github.com/sloria/TextBlob) library. It will add the additional extensions `._.polarity`, `._.subjectivity`, and `._.assessments` to `Doc`, `Span`, and `Token` objects. For spaCy 2 please use `pip install pip install spacytextblob==0.1.7`",
             "github": "SamEdwardes/spaCyTextBlob",
             "pip": "spacytextblob",

From 49ff1126bf272b62128ff4677a5c636ac525d6be Mon Sep 17 00:00:00 2001
From: meghanabhange <meghanabhange13@gmail.com>
Date: Thu, 22 Apr 2021 12:18:17 +0530
Subject: [PATCH 107/146] Project Idea : denomme | Multilingual Name Detection
 (#7845)

* Add denomme

* spaCy contributor agreement

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 .github/contributors/meghanabhange.md | 107 ++++++++++++++++++++++++++
 website/meta/universe.json            |  24 ++++++
 2 files changed, 131 insertions(+)
 create mode 100644 .github/contributors/meghanabhange.md

diff --git a/.github/contributors/meghanabhange.md b/.github/contributors/meghanabhange.md
new file mode 100644
index 000000000..2aaa57d10
--- /dev/null
+++ b/.github/contributors/meghanabhange.md
@@ -0,0 +1,107 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your 
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                    |
+|------------------------------- | ------------------------ |
+| Name                           | Meghana Bhange            |
+| Company name (if applicable)   | Verloop.io                 |
+| Title or role (if applicable)  | ML Engineer        |
+| Date                           | 2020-04-21               |
+| GitHub username                | meghanabhange                 |
+| Website (optional)             | https://meghana.blog |
+
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 50137ee99..a0183c15d 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,6 +1,30 @@
 {
     "resources": [
         {
+            "id": "denomme",
+            "title": "denomme : Multilingual Name Detector",
+            "slogan": "Multilingual Name Detection",
+            "description": "A spaCy extension for Spans that extracts multilingual person names from documents, using a model trained on an XLM-RoBERTa backbone",
+            "github": "meghanabhange/denomme",
+            "pip": "denomme",
+            "code_example": [
+                "from spacy.lang.xx import MultiLanguage",
+                "from denomme.name import person_name_component",
+                "nlp = MultiLanguage()",
+                "nlp.add_pipe('denomme')",
+                "doc = nlp('Hi my name is Meghana S.R Bhange and I want to talk Asha')",
+                "print(doc._.person_name)",
+                "# ['Meghana S.R Bhange', 'Asha']"
+            ],
+            "author": "Meghana Bhange",
+            "author_links": {
+                "github": "meghanabhange",
+                "twitter": "_aspiringcat"
+            },
+            "category": ["standalone"],
+            "tags": ["person-name-detection"]
+        },
+        {
             "id": "skweak",
             "title": "skweak",
             "slogan": "Weak supervision for NLP",

From 2e746dbf32c8641bc4e0a97420b43c41954ff665 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 22 Apr 2021 08:50:09 +0200
Subject: [PATCH 108/146] update EL training data format in docs (#7839)

* update EL training data format

* fix typo

* all -1 because reasons
---
 website/docs/api/data-formats.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 53ca8a51d..1196d20af 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -451,9 +451,11 @@ doc = nlp("I'm pretty happy about that!")
 gold_dict = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
 example = Example.from_dict(doc, gold_dict)
 
-# Training data for an Entity Linking component
+# Training data for an Entity Linking component (also requires entities & sentences)
 doc = nlp("Russ Cochran his reprints include EC Comics.")
-gold_dict = {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}
+gold_dict = {"entities": [(0, 12, "PERSON")],
+             "links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}},
+             "sent_starts": [1, -1, -1, -1, -1, -1, -1, -1]}
 example = Example.from_dict(doc, gold_dict)
 ```
 

From 6f565cf39dc5ca494344883643f98373ebe2654c Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 22 Apr 2021 09:59:24 +0200
Subject: [PATCH 109/146] fix typo in entity_linker docs

---
 website/docs/api/entitylinker.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 1cc864059..b3a1054fc 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -142,7 +142,7 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
 | `batch_size`   | The number of documents to buffer. Defaults to `128`. ~~int~~ |
 | **YIELDS**     | The processed documents in order. ~~Doc~~                     |
 
-## EntityLinker.set_kb {#initialize tag="method" new="3"}
+## EntityLinker.set_kb {#set_kb tag="method" new="3"}
 
 The `kb_loader` should be a function that takes a `Vocab` instance and creates
 the `KnowledgeBase`, ensuring that the strings of the knowledge base are synced

From d2bdaa78231b62ae06afbd8ad92a22e0cd9cf1dd Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 10:04:15 +0200
Subject: [PATCH 110/146] Replace negative rows with 0 in StaticVectors (#7674)

* Replace negative rows with 0 in StaticVectors

Replace negative row indices with 0-vectors in `StaticVectors`.

* Increase versions related to StaticVectors

* Increase versions of all architectures and layers related to
`StaticVectors`
* Improve efficiency of 0-vector operations

Parallel `spacy-legacy` PR: https://github.com/explosion/spacy-legacy/pull/5

* Update config defaults to new versions

* Update docs
---
 requirements.txt                              |  2 +-
 setup.cfg                                     |  2 +-
 spacy/cli/templates/quickstart_training.jinja |  2 +-
 spacy/ml/models/tok2vec.py                    |  6 +-
 spacy/ml/staticvectors.py                     |  4 +-
 spacy/pipeline/dep_parser.pyx                 |  2 +-
 spacy/pipeline/entity_linker.py               |  2 +-
 spacy/pipeline/morphologizer.pyx              |  2 +-
 spacy/pipeline/multitask.pyx                  |  2 +-
 spacy/pipeline/ner.pyx                        |  2 +-
 spacy/pipeline/senter.pyx                     |  2 +-
 spacy/pipeline/tagger.pyx                     |  2 +-
 spacy/pipeline/textcat.py                     |  4 +-
 spacy/pipeline/textcat_multilabel.py          |  4 +-
 spacy/pipeline/tok2vec.py                     |  2 +-
 website/docs/api/architectures.md             | 33 ++++----
 website/docs/api/data-formats.md              |  4 +-
 website/docs/api/legacy.md                    | 82 +++++++++++++------
 website/docs/usage/embeddings-transformers.md |  8 +-
 website/docs/usage/layers-architectures.md    |  8 +-
 20 files changed, 106 insertions(+), 69 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 86ffa9945..91fac7894 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.2,<3.1.0
+spacy-legacy>=3.0.3,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.2,<8.1.0
diff --git a/setup.cfg b/setup.cfg
index 6c65277c6..c60d78fc4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     thinc>=8.0.2,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.2,<3.1.0
+    spacy-legacy>=3.0.3,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 38fc23272..e43c21bbd 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -206,7 +206,7 @@ factory = "tok2vec"
 @architectures = "spacy.Tok2Vec.v2"
 
 [components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
 {% if has_letters -%}
 attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 5790af631..76ec87054 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -31,7 +31,7 @@ def get_tok2vec_width(model: Model):
     return nO
 
 
-@registry.architectures("spacy.HashEmbedCNN.v1")
+@registry.architectures("spacy.HashEmbedCNN.v2")
 def build_hash_embed_cnn_tok2vec(
     *,
     width: int,
@@ -108,7 +108,7 @@ def build_Tok2Vec_model(
     return tok2vec
 
 
-@registry.architectures("spacy.MultiHashEmbed.v1")
+@registry.architectures("spacy.MultiHashEmbed.v2")
 def MultiHashEmbed(
     width: int,
     attrs: List[Union[str, int]],
@@ -182,7 +182,7 @@ def MultiHashEmbed(
     return model
 
 
-@registry.architectures("spacy.CharacterEmbed.v1")
+@registry.architectures("spacy.CharacterEmbed.v2")
 def CharacterEmbed(
     width: int,
     rows: int,
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index ea4c7fb77..cfd25c24b 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -8,7 +8,7 @@ from ..tokens import Doc
 from ..errors import Errors
 
 
-@registry.layers("spacy.StaticVectors.v1")
+@registry.layers("spacy.StaticVectors.v2")
 def StaticVectors(
     nO: Optional[int] = None,
     nM: Optional[int] = None,
@@ -46,6 +46,8 @@ def forward(
         vectors_data = model.ops.gemm(model.ops.as_contig(V[rows]), W, trans2=True)
     except ValueError:
         raise RuntimeError(Errors.E896)
+    # Convert negative indices to 0-vectors (TODO: more options for UNK tokens)
+    vectors_data[rows < 0] = 0
     output = Ragged(
         vectors_data, model.ops.asarray([len(doc) for doc in docs], dtype="i")
     )
diff --git a/spacy/pipeline/dep_parser.pyx b/spacy/pipeline/dep_parser.pyx
index 7290c4637..37f09ce3a 100644
--- a/spacy/pipeline/dep_parser.pyx
+++ b/spacy/pipeline/dep_parser.pyx
@@ -24,7 +24,7 @@ maxout_pieces = 2
 use_upper = true
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 6ab52fb35..66070916e 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -26,7 +26,7 @@ default_model_config = """
 @architectures = "spacy.EntityLinker.v1"
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 2
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index cd0081346..3ba05e616 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -27,7 +27,7 @@ default_model_config = """
 @architectures = "spacy.Tok2Vec.v2"
 
 [model.tok2vec.embed]
-@architectures = "spacy.CharacterEmbed.v1"
+@architectures = "spacy.CharacterEmbed.v2"
 width = 128
 rows = 7000
 nM = 64
diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx
index 990b6a1de..8c44061e2 100644
--- a/spacy/pipeline/multitask.pyx
+++ b/spacy/pipeline/multitask.pyx
@@ -22,7 +22,7 @@ maxout_pieces = 3
 token_vector_width = 96
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/ner.pyx b/spacy/pipeline/ner.pyx
index 3a2151b01..0b9b0d324 100644
--- a/spacy/pipeline/ner.pyx
+++ b/spacy/pipeline/ner.pyx
@@ -21,7 +21,7 @@ maxout_pieces = 2
 use_upper = true
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx
index 83cd06739..f9472abf5 100644
--- a/spacy/pipeline/senter.pyx
+++ b/spacy/pipeline/senter.pyx
@@ -19,7 +19,7 @@ default_model_config = """
 @architectures = "spacy.Tagger.v1"
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 12
 depth = 1
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 9af5245c1..938131f6f 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -26,7 +26,7 @@ default_model_config = """
 @architectures = "spacy.Tagger.v1"
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 174ffd273..1d652a483 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -21,7 +21,7 @@ single_label_default_config = """
 @architectures = "spacy.Tok2Vec.v2"
 
 [model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000, 2000, 1000, 1000, 1000, 1000]
 attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@@ -56,7 +56,7 @@ single_label_cnn_config = """
 exclusive_classes = true
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 036bc8dc5..7267735b4 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -21,7 +21,7 @@ multi_label_default_config = """
 @architectures = "spacy.Tok2Vec.v1"
 
 [model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000, 2000, 1000, 1000, 1000, 1000]
 attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@@ -56,7 +56,7 @@ multi_label_cnn_config = """
 exclusive_classes = false
 
 [model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 26a4c998c..3ee324d50 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -11,7 +11,7 @@ from ..errors import Errors
 
 default_model_config = """
 [model]
-@architectures = "spacy.HashEmbedCNN.v1"
+@architectures = "spacy.HashEmbedCNN.v2"
 pretrained_vectors = null
 width = 96
 depth = 4
diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md
index 4c4bf73f4..e09352ec9 100644
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@@ -35,7 +35,7 @@ usage documentation on
 > @architectures = "spacy.Tok2Vec.v2"
 >
 > [model.embed]
-> @architectures = "spacy.CharacterEmbed.v1"
+> @architectures = "spacy.CharacterEmbed.v2"
 > # ...
 >
 > [model.encode]
@@ -54,13 +54,13 @@ blog post for background.
 | `encode`    | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~            |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                           |
 
-### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
+### spacy.HashEmbedCNN.v2 {#HashEmbedCNN}
 
 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > pretrained_vectors = null
 > width = 96
 > depth = 4
@@ -96,7 +96,7 @@ consisting of a CNN and a layer-normalized maxout activation function.
 > factory = "tok2vec"
 >
 > [components.tok2vec.model]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > width = 342
 >
 > [components.tagger]
@@ -129,13 +129,13 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 | `upstream`  | A string to identify the "upstream" `Tok2Vec` component to communicate with. By default, the upstream name is the wildcard string `"*"`, but you could also specify the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                               |
 
-### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
+### spacy.MultiHashEmbed.v2 {#MultiHashEmbed}
 
 > #### Example config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.MultiHashEmbed.v1"
+> @architectures = "spacy.MultiHashEmbed.v2"
 > width = 64
 > attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
 > rows = [2000, 1000, 1000, 1000]
@@ -160,13 +160,13 @@ not updated).
 | `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [`Doc`](/api/doc) objects' vocab. ~~bool~~                                                                                                                                                                                                                                                                                                                   |
 | **CREATES**              | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                                                                                                                                                                                                                                             |
 
-### spacy.CharacterEmbed.v1 {#CharacterEmbed}
+### spacy.CharacterEmbed.v2 {#CharacterEmbed}
 
 > #### Example config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.CharacterEmbed.v1"
+> @architectures = "spacy.CharacterEmbed.v2"
 > width = 128
 > rows = 7000
 > nM = 64
@@ -266,13 +266,13 @@ Encode context using bidirectional LSTM layers. Requires
 | `dropout`   | Creates a Dropout layer on the outputs of each LSTM layer except the last layer. Set to 0.0 to disable this functionality. ~~float~~                                                                           |
 | **CREATES** | The model using the architecture. ~~Model[List[Floats2d], List[Floats2d]]~~                                                                                                                                    |
 
-### spacy.StaticVectors.v1 {#StaticVectors}
+### spacy.StaticVectors.v2 {#StaticVectors}
 
 > #### Example config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.StaticVectors.v1"
+> @architectures = "spacy.StaticVectors.v2"
 > nO = null
 > nM = null
 > dropout = 0.2
@@ -283,8 +283,9 @@ Encode context using bidirectional LSTM layers. Requires
 > ```
 
 Embed [`Doc`](/api/doc) objects with their vocab's vectors table, applying a
-learned linear projection to control the dimensionality. See the documentation
-on [static vectors](/usage/embeddings-transformers#static-vectors) for details.
+learned linear projection to control the dimensionality. Unknown tokens are
+mapped to a zero vector. See the documentation on [static
+vectors](/usage/embeddings-transformers#static-vectors) for details.
 
 | Name        |  Description                                                                                                                                                                                                            |
 | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -513,7 +514,7 @@ for a Tok2Vec layer.
 > use_upper = true
 >
 > [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > pretrained_vectors = null
 > width = 96
 > depth = 4
@@ -619,7 +620,7 @@ single-label use-cases where `exclusive_classes = true`, while the
 > @architectures = "spacy.Tok2Vec.v2"
 >
 > [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v1"
+> @architectures = "spacy.MultiHashEmbed.v2"
 > width = 64
 > rows = [2000, 2000, 1000, 1000, 1000, 1000]
 > attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@@ -676,7 +677,7 @@ taking it as argument:
 > nO = null
 >
 > [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > pretrained_vectors = null
 > width = 96
 > depth = 4
@@ -744,7 +745,7 @@ into the "real world". This requires 3 main components:
 > nO = null
 >
 > [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > pretrained_vectors = null
 > width = 96
 > depth = 2
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 1196d20af..69bdae446 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -29,8 +29,8 @@ recommended settings for your use case, check out the
 >
 > The `@` syntax lets you refer to function names registered in the
 > [function registry](/api/top-level#registry). For example,
-> `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of
-> the name [spacy.HashEmbedCNN.v1](/api/architectures#HashEmbedCNN) and all
+> `@architectures = "spacy.HashEmbedCNN.v2"` refers to a registered function of
+> the name [spacy.HashEmbedCNN.v2](/api/architectures#HashEmbedCNN) and all
 > other values defined in its block will be passed into that function as
 > arguments. Those arguments depend on the registered function. See the usage
 > guide on [registered functions](/usage/training#config-functions) for details.
diff --git a/website/docs/api/legacy.md b/website/docs/api/legacy.md
index 3e5c7f75f..96bc199bf 100644
--- a/website/docs/api/legacy.md
+++ b/website/docs/api/legacy.md
@@ -4,12 +4,13 @@ teaser: Archived implementations available through spacy-legacy
 source: spacy/legacy
 ---
 
-The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes 
-outdated registered functions and architectures. It is installed automatically as 
-a dependency of spaCy, and provides backwards compatibility for archived functions 
-that may still be used in projects.
+The [`spacy-legacy`](https://github.com/explosion/spacy-legacy) package includes
+outdated registered functions and architectures. It is installed automatically
+as a dependency of spaCy, and provides backwards compatibility for archived
+functions that may still be used in projects.
 
-You can find the detailed documentation of each such legacy function on this page.
+You can find the detailed documentation of each such legacy function on this
+page.
 
 ## Architectures {#architectures}
 
@@ -17,8 +18,8 @@ These functions are available from `@spacy.registry.architectures`.
 
 ### spacy.Tok2Vec.v1 {#Tok2Vec_v1}
 
-The `spacy.Tok2Vec.v1` architecture was expecting an `encode` model of type 
-`Model[Floats2D, Floats2D]` such as `spacy.MaxoutWindowEncoder.v1` or 
+The `spacy.Tok2Vec.v1` architecture was expecting an `encode` model of type
+`Model[Floats2D, Floats2D]` such as `spacy.MaxoutWindowEncoder.v1` or
 `spacy.MishWindowEncoder.v1`.
 
 > #### Example config
@@ -44,15 +45,14 @@ blog post for background.
 | Name        | Description                                                                                                                                                                                                                      |
 | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `embed`     | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
-| `encode`    | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~                            |
+| `encode`    | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder.v1](/api/legacy#MaxoutWindowEncoder_v1). ~~Model[Floats2d, Floats2d]~~                         |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                                                                           |
 
 ### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder_v1}
 
-The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type 
-`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been changed to output 
-type `Model[List[Floats2d], List[Floats2d]]`.
-
+The `spacy.MaxoutWindowEncoder.v1` architecture was producing a model of type
+`Model[Floats2D, Floats2D]`. Since `spacy.MaxoutWindowEncoder.v2`, this has been
+changed to output type `Model[List[Floats2d], List[Floats2d]]`.
 
 > #### Example config
 >
@@ -78,9 +78,9 @@ and residual connections.
 
 ### spacy.MishWindowEncoder.v1 {#MishWindowEncoder_v1}
 
-The `spacy.MishWindowEncoder.v1` architecture was producing a model of type 
-`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been changed to output 
-type `Model[List[Floats2d], List[Floats2d]]`.
+The `spacy.MishWindowEncoder.v1` architecture was producing a model of type
+`Model[Floats2D, Floats2D]`. Since `spacy.MishWindowEncoder.v2`, this has been
+changed to output type `Model[List[Floats2d], List[Floats2d]]`.
 
 > #### Example config
 >
@@ -103,12 +103,11 @@ and residual connections.
 | `depth`       | The number of convolutional layers. Recommended value is `4`. ~~int~~                                                                                                                                          |
 | **CREATES**   | The model using the architecture. ~~Model[Floats2d, Floats2d]~~                                                                                                                                                |
 
-
 ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble_v1}
 
-The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and `linear_model`. 
-Since `spacy.TextCatEnsemble.v2`, this has been refactored so that the `TextCatEnsemble` takes these 
-two sublayers as input.
+The `spacy.TextCatEnsemble.v1` architecture built an internal `tok2vec` and
+`linear_model`. Since `spacy.TextCatEnsemble.v2`, this has been refactored so
+that the `TextCatEnsemble` takes these two sublayers as input.
 
 > #### Example Config
 >
@@ -142,6 +141,40 @@ network has an internal CNN Tok2Vec layer and uses attention.
 | `nO`                 | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
 
+### spacy.HashEmbedCNN.v1 {#HashEmbedCNN_v1}
+
+Identical to [`spacy.HashEmbedCNN.v2`](/api/architectures#HashEmbedCNN) except
+using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are included.
+
+### spacy.MultiHashEmbed.v1 {#MultiHashEmbed_v1}
+
+Identical to [`spacy.MultiHashEmbed.v2`](/api/architectures#MultiHashEmbed)
+except with [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
+included.
+
+### spacy.CharacterEmbed.v1 {#CharacterEmbed_v1}
+
+Identical to [`spacy.CharacterEmbed.v2`](/api/architectures#CharacterEmbed)
+except using [`spacy.StaticVectors.v1`](#StaticVectors_v1) if vectors are
+included.
+
+## Layers {#layers}
+
+These functions are available from `@spacy.registry.layers`.
+
+### spacy.StaticVectors.v1 {#StaticVectors_v1}
+
+Identical to [`spacy.StaticVectors.v2`](/api/architectures#StaticVectors) except
+for the handling of tokens without vectors.
+
+<Infobox title="Bugs for tokens without vectors" variant="warning">
+
+`spacy.StaticVectors.v1` maps tokens without vectors to the final row in the
+vectors table, which causes the model predictions to change if new vectors are
+added to an existing vectors table. See more details in
+[issue #7662](https://github.com/explosion/spaCy/issues/7662#issuecomment-813925655).
+
+</Infobox>
 
 ## Loggers {#loggers}
 
@@ -149,7 +182,7 @@ These functions are available from `@spacy.registry.loggers`.
 
 ### spacy.WandbLogger.v1 {#WandbLogger_v1}
 
-The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet 
+The first version of the [`WandbLogger`](/api/top-level#WandbLogger) did not yet
 support the `log_dataset_dir` and `model_log_interval` arguments.
 
 > #### Example config
@@ -160,7 +193,8 @@ support the `log_dataset_dir` and `model_log_interval` arguments.
 > project_name = "monitor_spacy_training"
 > remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
 > ```
-| Name                   | Description                                                                                                                           |
-| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
-| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              |
+>
+> | Name                   | Description                                                                                                                           |
+> | ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+> | `project_name`         | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
+> | `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~                              |
diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index e71336e84..4113e9394 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -132,7 +132,7 @@ factory = "tok2vec"
 @architectures = "spacy.Tok2Vec.v2"
 
 [components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 
 [components.tok2vec.model.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v2"
@@ -164,7 +164,7 @@ factory = "ner"
 @architectures = "spacy.Tok2Vec.v2"
 
 [components.ner.model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 
 [components.ner.model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v2"
@@ -541,7 +541,7 @@ word vector tables using the `include_static_vectors` flag.
 
 ```ini
 [tagger.model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 128
 attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
 rows = [5000,2500,2500,2500]
@@ -550,7 +550,7 @@ include_static_vectors = true
 
 <Infobox title="How it works" emoji="💡">
 
-The configuration system will look up the string `"spacy.MultiHashEmbed.v1"` in
+The configuration system will look up the string `"spacy.MultiHashEmbed.v2"` in
 the `architectures` [registry](/api/top-level#registry), and call the returned
 object with the rest of the arguments from the block. This will result in a call
 to the
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index 0bc935d51..8fe2cf489 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -137,7 +137,7 @@ nO = null
 @architectures = "spacy.Tok2Vec.v2"
 
 [components.textcat.model.tok2vec.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 width = 64
 rows = [2000, 2000, 1000, 1000, 1000, 1000]
 attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
@@ -204,7 +204,7 @@ factory = "tok2vec"
 @architectures = "spacy.Tok2Vec.v2"
 
 [components.tok2vec.model.embed]
-@architectures = "spacy.MultiHashEmbed.v1"
+@architectures = "spacy.MultiHashEmbed.v2"
 # ...
 
 [components.tok2vec.model.encode]
@@ -220,7 +220,7 @@ architecture:
 ```ini
 ### config.cfg (excerpt)
 [components.tok2vec.model.embed]
-@architectures = "spacy.CharacterEmbed.v1"
+@architectures = "spacy.CharacterEmbed.v2"
 # ...
 
 [components.tok2vec.model.encode]
@@ -638,7 +638,7 @@ that has the full implementation.
 > @architectures = "rel_instance_tensor.v1"
 >
 > [model.create_instance_tensor.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v1"
+> @architectures = "spacy.HashEmbedCNN.v2"
 > # ...
 >
 > [model.create_instance_tensor.pooling]

From cfad7e21d59d9f0866e6e21558bf2d209cc0cf02 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 22 Apr 2021 10:09:13 +0200
Subject: [PATCH 111/146] fix config parsing of ints/strings (#7755)

* add few failing tests for parsing integers and strings

* bump thinc to 8.0.3
---
 pyproject.toml          |  2 +-
 requirements.txt        |  2 +-
 setup.cfg               |  4 ++--
 spacy/tests/test_cli.py | 27 +++++++++++++++++++++++++--
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f00fdc9f4..3e34a0b2d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.2,<8.1.0",
+    "thinc>=8.0.3,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
diff --git a/requirements.txt b/requirements.txt
index 91fac7894..517553241 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 spacy-legacy>=3.0.3,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.2,<8.1.0
+thinc>=8.0.3,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index c60d78fc4..ffdb8b2b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,14 +34,14 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.2,<8.1.0
+    thinc>=8.0.3,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.3,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.2,<8.1.0
+    thinc>=8.0.3,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index c36be9c57..2013ceac4 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -307,8 +307,11 @@ def test_project_config_validation2(config, n_errors):
     assert len(errors) == n_errors
 
 
-def test_project_config_interpolation():
-    variables = {"a": 10, "b": {"c": "foo", "d": True}}
+@pytest.mark.parametrize(
+    "int_value", [10, pytest.param("10", marks=pytest.mark.xfail)],
+)
+def test_project_config_interpolation(int_value):
+    variables = {"a": int_value, "b": {"c": "foo", "d": True}}
     commands = [
         {"name": "x", "script": ["hello ${vars.a} ${vars.b.c}"]},
         {"name": "y", "script": ["${vars.b.c} ${vars.b.d}"]},
@@ -317,6 +320,8 @@ def test_project_config_interpolation():
     with make_tempdir() as d:
         srsly.write_yaml(d / "project.yml", project)
         cfg = load_project_config(d)
+    assert type(cfg) == dict
+    assert type(cfg["commands"]) == list
     assert cfg["commands"][0]["script"][0] == "hello 10 foo"
     assert cfg["commands"][1]["script"][0] == "foo true"
     commands = [{"name": "x", "script": ["hello ${vars.a} ${vars.b.e}"]}]
@@ -325,6 +330,24 @@ def test_project_config_interpolation():
         substitute_project_variables(project)
 
 
+@pytest.mark.parametrize(
+    "greeting", [342, "everyone", "tout le monde", pytest.param("42", marks=pytest.mark.xfail)],
+)
+def test_project_config_interpolation_override(greeting):
+    variables = {"a": "world"}
+    commands = [
+        {"name": "x", "script": ["hello ${vars.a}"]},
+    ]
+    overrides = {"vars.a": greeting}
+    project = {"commands": commands, "vars": variables}
+    with make_tempdir() as d:
+        srsly.write_yaml(d / "project.yml", project)
+        cfg = load_project_config(d, overrides=overrides)
+    assert type(cfg) == dict
+    assert type(cfg["commands"]) == list
+    assert cfg["commands"][0]["script"][0] == f"hello {greeting}"
+
+
 def test_project_config_interpolation_env():
     variables = {"a": 10}
     env_var = "SPACY_TEST_FOO"

From f4339f9bff72391bb0a363f6f28aa47335f660eb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 10:14:57 +0200
Subject: [PATCH 112/146] Fix tokenizer cache flushing (#7836)

* Fix tokenizer cache flushing

Fix/simplify tokenizer init detection in order to fix cache flushing
when properties are modified.

* Remove init reloading logic

* Remove logic disabling `_reload_special_cases` on init
  * Setting `rules` last in `__init__` (as before) means that setting
    other properties doesn't reload any special cases
  * Reset `rules` first in `from_bytes` so that setting other properties
    during deserialization doesn't reload any special cases
    unnecessarily
* Reset all properties in `Tokenizer.from_bytes` to allow any settings
  to be `None`

* Also reset special matcher when special cache is flushed

* Remove duplicate special case validation

* Add test for special cases flushing

* Extend test for tokenizer deserialization of None values
---
 .../serialize/test_serialize_tokenizer.py     |  4 ++
 spacy/tests/tokenizer/test_tokenizer.py       | 29 +++++++++++
 spacy/tokenizer.pxd                           |  4 +-
 spacy/tokenizer.pyx                           | 50 ++++++-------------
 4 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py
index ae612114a..a9450cd04 100644
--- a/spacy/tests/serialize/test_serialize_tokenizer.py
+++ b/spacy/tests/serialize/test_serialize_tokenizer.py
@@ -26,10 +26,14 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
     assert tokenizer.rules != {}
     assert tokenizer.token_match is not None
     assert tokenizer.url_match is not None
+    assert tokenizer.prefix_search is not None
+    assert tokenizer.infix_finditer is not None
     tokenizer.from_bytes(tokenizer_bytes)
     assert tokenizer.rules == {}
     assert tokenizer.token_match is None
     assert tokenizer.url_match is None
+    assert tokenizer.prefix_search is None
+    assert tokenizer.infix_finditer is None
 
     tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
     tokenizer.rules = {}
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index 4f5eddb95..6cfeaf014 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -1,4 +1,5 @@
 import pytest
+import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.util import ensure_path
@@ -186,3 +187,31 @@ def test_tokenizer_special_cases_spaces(tokenizer):
     assert [t.text for t in tokenizer("a b c")] == ["a", "b", "c"]
     tokenizer.add_special_case("a b c", [{"ORTH": "a b c"}])
     assert [t.text for t in tokenizer("a b c")] == ["a b c"]
+
+
+def test_tokenizer_flush_cache(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    tokenizer = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+    )
+    assert [t.text for t in tokenizer("a.")] == ["a", "."]
+    tokenizer.suffix_search = None
+    assert [t.text for t in tokenizer("a.")] == ["a."]
+
+
+def test_tokenizer_flush_specials(en_vocab):
+    suffix_re = re.compile(r"[\.]$")
+    rules = {"a a": [{"ORTH": "a a"}]}
+    tokenizer1 = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+        rules=rules,
+    )
+    tokenizer2 = Tokenizer(
+        en_vocab,
+        suffix_search=suffix_re.search,
+    )
+    assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
+    tokenizer1.rules = {}
+    assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 9c1398a17..2a44d7729 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -23,8 +23,8 @@ cdef class Tokenizer:
     cdef object _infix_finditer
     cdef object _rules
     cdef PhraseMatcher _special_matcher
-    cdef int _property_init_count
-    cdef int _property_init_max
+    cdef int _property_init_count  # TODO: unused, remove in v3.1
+    cdef int _property_init_max    # TODO: unused, remove in v3.1
 
     cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases)
     cdef int _apply_special_cases(self, Doc doc) except -1
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 41bbaeee6..61a7582b1 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -69,8 +69,6 @@ cdef class Tokenizer:
         self._rules = {}
         self._special_matcher = PhraseMatcher(self.vocab)
         self._load_special_cases(rules)
-        self._property_init_count = 0
-        self._property_init_max = 4
 
     property token_match:
         def __get__(self):
@@ -79,8 +77,6 @@ cdef class Tokenizer:
         def __set__(self, token_match):
             self._token_match = token_match
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property url_match:
         def __get__(self):
@@ -88,7 +84,7 @@ cdef class Tokenizer:
 
         def __set__(self, url_match):
             self._url_match = url_match
-            self._flush_cache()
+            self._reload_special_cases()
 
     property prefix_search:
         def __get__(self):
@@ -97,8 +93,6 @@ cdef class Tokenizer:
         def __set__(self, prefix_search):
             self._prefix_search = prefix_search
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property suffix_search:
         def __get__(self):
@@ -107,8 +101,6 @@ cdef class Tokenizer:
         def __set__(self, suffix_search):
             self._suffix_search = suffix_search
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property infix_finditer:
         def __get__(self):
@@ -117,8 +109,6 @@ cdef class Tokenizer:
         def __set__(self, infix_finditer):
             self._infix_finditer = infix_finditer
             self._reload_special_cases()
-            if self._property_init_count <= self._property_init_max:
-                self._property_init_count += 1
 
     property rules:
         def __get__(self):
@@ -126,7 +116,7 @@ cdef class Tokenizer:
 
         def __set__(self, rules):
             self._rules = {}
-            self._reset_cache([key for key in self._cache])
+            self._flush_cache()
             self._flush_specials()
             self._cache = PreshMap()
             self._specials = PreshMap()
@@ -226,6 +216,7 @@ cdef class Tokenizer:
                 self.mem.free(cached)
 
     def _flush_specials(self):
+        self._special_matcher = PhraseMatcher(self.vocab)
         for k in self._specials:
             cached = <_Cached*>self._specials.get(k)
             del self._specials[k]
@@ -568,7 +559,6 @@ cdef class Tokenizer:
         """Add special-case tokenization rules."""
         if special_cases is not None:
             for chunk, substrings in sorted(special_cases.items()):
-                self._validate_special_case(chunk, substrings)
                 self.add_special_case(chunk, substrings)
 
     def _validate_special_case(self, chunk, substrings):
@@ -616,16 +606,9 @@ cdef class Tokenizer:
             self._special_matcher.add(string, None, self._tokenize_affixes(string, False))
 
     def _reload_special_cases(self):
-        try:
-            self._property_init_count
-        except AttributeError:
-            return
-        # only reload if all 4 of prefix, suffix, infix, token_match have
-        # have been initialized
-        if self.vocab is not None and self._property_init_count >= self._property_init_max:
-            self._flush_cache()
-            self._flush_specials()
-            self._load_special_cases(self._rules)
+        self._flush_cache()
+        self._flush_specials()
+        self._load_special_cases(self._rules)
 
     def explain(self, text):
         """A debugging tokenizer that provides information about which
@@ -811,6 +794,15 @@ cdef class Tokenizer:
             "url_match": lambda b: data.setdefault("url_match", b),
             "exceptions": lambda b: data.setdefault("rules", b)
         }
+        # reset all properties and flush all caches (through rules),
+        # reset rules first so that _reload_special_cases is trivial/fast as
+        # the other properties are reset
+        self.rules = {}
+        self.prefix_search = None
+        self.suffix_search = None
+        self.infix_finditer = None
+        self.token_match = None
+        self.url_match = None
         msg = util.from_bytes(bytes_data, deserializers, exclude)
         if "prefix_search" in data and isinstance(data["prefix_search"], str):
             self.prefix_search = re.compile(data["prefix_search"]).search
@@ -818,22 +810,12 @@ cdef class Tokenizer:
             self.suffix_search = re.compile(data["suffix_search"]).search
         if "infix_finditer" in data and isinstance(data["infix_finditer"], str):
             self.infix_finditer = re.compile(data["infix_finditer"]).finditer
-        # for token_match and url_match, set to None to override the language
-        # defaults if no regex is provided
         if "token_match" in data and isinstance(data["token_match"], str):
             self.token_match = re.compile(data["token_match"]).match
-        else:
-            self.token_match = None
         if "url_match" in data and isinstance(data["url_match"], str):
             self.url_match = re.compile(data["url_match"]).match
-        else:
-            self.url_match = None
         if "rules" in data and isinstance(data["rules"], dict):
-            # make sure to hard reset the cache to remove data from the default exceptions
-            self._rules = {}
-            self._flush_cache()
-            self._flush_specials()
-            self._load_special_cases(data["rules"])
+            self.rules = data["rules"]
         return self
 
 

From f68fc29130f913f36e0f9339a902cee5f1ed7179 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 11:32:45 +0200
Subject: [PATCH 113/146] Update sent_starts in Example.from_dict (#7847)

* Update sent_starts in Example.from_dict

Update `sent_starts` for `Example.from_dict` so that `Optional[bool]`
values have the same meaning as for `Token.is_sent_start`.

Use `Optional[bool]` as the type for sent start values in the docs.

* Use helper function for conversion to ternary ints
---
 spacy/tests/training/test_new_example.py | 10 ++++++++--
 spacy/training/example.pyx               |  4 ++--
 spacy/util.py                            | 15 +++++++++++++++
 website/docs/api/data-formats.md         |  2 +-
 website/docs/api/doc.md                  |  2 +-
 website/docs/api/token.md                |  2 +-
 6 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index b8fbaf606..ba58ea96d 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -2,6 +2,7 @@ import pytest
 from spacy.training.example import Example
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy.util import to_ternary_int
 
 
 def test_Example_init_requires_doc_objects():
@@ -121,7 +122,7 @@ def test_Example_from_dict_with_morphology(annots):
     [
         {
             "words": ["This", "is", "one", "sentence", "this", "is", "another"],
-            "sent_starts": [1, 0, 0, 0, 1, 0, 0],
+            "sent_starts": [1, False, 0, None, True, -1, -5.7],
         }
     ],
 )
@@ -131,7 +132,12 @@ def test_Example_from_dict_with_sent_start(annots):
     example = Example.from_dict(predicted, annots)
     assert len(list(example.reference.sents)) == 2
     for i, token in enumerate(example.reference):
-        assert bool(token.is_sent_start) == bool(annots["sent_starts"][i])
+        if to_ternary_int(annots["sent_starts"][i]) == 1:
+            assert token.is_sent_start is True
+        elif to_ternary_int(annots["sent_starts"][i]) == 0:
+            assert token.is_sent_start is None
+        else:
+            assert token.is_sent_start is False
 
 
 @pytest.mark.parametrize(
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 74af793bd..07a83bfec 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -13,7 +13,7 @@ from .iob_utils import biluo_tags_to_spans
 from ..errors import Errors, Warnings
 from ..pipeline._parser_internals import nonproj
 from ..tokens.token cimport MISSING_DEP
-from ..util import logger
+from ..util import logger, to_ternary_int
 
 
 cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
@@ -338,7 +338,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
             values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
         elif key == "SENT_START":
             attrs.append(key)
-            values.append(value)
+            values.append([to_ternary_int(v) for v in value])
         elif key == "MORPH":
             attrs.append(key)
             values.append([vocab.morphology.add(v) for v in value])
diff --git a/spacy/util.py b/spacy/util.py
index d8854f68d..512c6b742 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1526,3 +1526,18 @@ def check_lexeme_norms(vocab, component_name):
     if len(lexeme_norms) == 0 and vocab.lang in LEXEME_NORM_LANGS:
         langs = ", ".join(LEXEME_NORM_LANGS)
         logger.debug(Warnings.W033.format(model=component_name, langs=langs))
+
+
+def to_ternary_int(val) -> int:
+    """Convert a value to the ternary 1/0/-1 int used for True/None/False in
+    attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
+    (None), any other values (including False) are -1 (False).
+    """
+    # match the True/False/None singletons by identity (False == 0 in Python),
+    # then compare numbers by value instead of relying on int object identity
+    if val is True or (val is not False and val == 1):
+        return 1
+    elif val is None or (val is not False and val == 0):
+        return 0
+    else:
+        return -1
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 69bdae446..0c2a4c9f3 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -390,7 +390,7 @@ file to keep track of your settings and hyperparameters and your own
 >    "tags": List[str],
 >    "pos": List[str],
 >    "morphs": List[str],
->    "sent_starts": List[bool],
+>    "sent_starts": List[Optional[bool]],
 >    "deps": List[string],
 >    "heads": List[int],
 >    "entities": List[str],
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index c8917efa1..9358507dc 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -44,7 +44,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | `lemmas` <Tag variant="new">3</Tag>      | A list of strings, of the same length as `words`, to assign as `token.lemma` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                            |
 | `heads` <Tag variant="new">3</Tag>       | A list of values, of the same length as `words`, to assign as the head for each word. Head indices are the absolute position of the head in the `Doc`. Defaults to `None`. ~~Optional[List[int]]~~ |
 | `deps` <Tag variant="new">3</Tag>        | A list of strings, of the same length as `words`, to assign as `token.dep` for each word. Defaults to `None`. ~~Optional[List[str]]~~                                                              |
-| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Union[bool, None]]~~  |
+| `sent_starts` <Tag variant="new">3</Tag> | A list of values, of the same length as `words`, to assign as `token.is_sent_start`. Will be overridden by heads if `heads` is provided. Defaults to `None`. ~~Optional[List[Optional[bool]]]~~    |
 | `ents` <Tag variant="new">3</Tag>        | A list of strings, of the same length of `words`, to assign the token-based IOB tag. Defaults to `None`. ~~Optional[List[str]]~~                                                                   |
 
 ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
diff --git a/website/docs/api/token.md b/website/docs/api/token.md
index 687705524..ecf7bcc8e 100644
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@@ -364,7 +364,7 @@ unknown. Defaults to `True` for the first token in the `Doc`.
 
 | Name        | Description                                   |
 | ----------- | --------------------------------------------- |
-| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
+| **RETURNS** | Whether the token starts a sentence. ~~Optional[bool]~~ |
 
 ## Token.has_vector {#has_vector tag="property" model="vectors"}
 

From bdb485cc80ca6822471eac0a29ac2782d9b55450 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 12:36:50 +0200
Subject: [PATCH 114/146] Add callback to copy vocab/tokenizer from model
 (#7750)

* Add callback to copy vocab/tokenizer from model

Add callback `spacy.copy_from_base_model.v1` to copy the tokenizer
settings and/or vocab (including vectors) from a base model.

* Move spacy.copy_from_base_model.v1 to spacy.training.callbacks

* Add documentation

* Modify to specify model as tokenizer and vocab params
---
 spacy/errors.py               |  3 +++
 spacy/training/__init__.py    |  1 +
 spacy/training/callbacks.py   | 32 ++++++++++++++++++++++++++++++++
 website/docs/api/top-level.md | 30 ++++++++++++++++++++++++++++++
 4 files changed, 66 insertions(+)
 create mode 100644 spacy/training/callbacks.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 453e98b59..7cf9e54e4 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -501,6 +501,9 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E872 = ("Unable to copy tokenizer from base model due to different "
+            'tokenizer settings: current tokenizer config "{curr_config}" '
+            'vs. base model "{base_config}"')
     E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
             "'{text}'. This is likely a bug in spaCy, so feel free to open an "
             "issue: https://github.com/explosion/spaCy/issues")
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 5111b80dc..055f30f42 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .loggers import console_logger, wandb_logger  # noqa: F401
+from .callbacks import create_copy_from_base_model  # noqa: F401
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
new file mode 100644
index 000000000..2a21be98c
--- /dev/null
+++ b/spacy/training/callbacks.py
@@ -0,0 +1,32 @@
+from typing import Callable, Optional
+from ..errors import Errors
+from ..language import Language
+from ..util import load_model, registry, logger
+
+
+@registry.callbacks("spacy.copy_from_base_model.v1")
+def create_copy_from_base_model(
+    tokenizer: Optional[str] = None,
+    vocab: Optional[str] = None,
+) -> Callable[[Language], None]:
+    """Create a callback that copies the tokenizer and/or vocab of the named
+    base pipelines into the current nlp object; falsy names are skipped."""
+
+    def copy_from_base_model(nlp: Language) -> None:
+        if tokenizer:
+            logger.info(f"Copying tokenizer from: {tokenizer}")
+            base_nlp = load_model(tokenizer)
+            curr = nlp.config["nlp"]["tokenizer"]
+            base = base_nlp.config["nlp"]["tokenizer"]
+            if curr != base:
+                err = Errors.E872.format(curr_config=curr, base_config=base)
+                raise ValueError(err)
+            nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
+        if vocab:
+            logger.info(f"Copying vocab from: {vocab}")
+            # only reload if the vocab is from a different model
+            if tokenizer != vocab:
+                base_nlp = load_model(vocab)
+            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())
+
+    return copy_from_base_model
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 38bc40b11..cfaa75bff 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -8,6 +8,7 @@ menu:
   - ['Readers', 'readers']
   - ['Batchers', 'batchers']
   - ['Augmenters', 'augmenters']
+  - ['Callbacks', 'callbacks']
   - ['Training & Alignment', 'gold']
   - ['Utility Functions', 'util']
 ---
@@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
 | `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                    |
 | **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
 
+## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}
+
+The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
+several points in the lifecycle that can be used to modify the `nlp` object.
+
+### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [initialize.before_init]
+> @callbacks = "spacy.copy_from_base_model.v1"
+> tokenizer = "en_core_sci_md"
+> vocab = "en_core_sci_md"
+> ```
+
+Copy the tokenizer and/or vocab from the specified models. It's similar to the
+v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
+combination with
+[sourced components](/usage/processing-pipelines#sourced-components) when
+fine-tuning an existing pipeline. The vocab includes the lookups and the vectors
+from the specified model. Intended for use in `[initialize.before_init]`.
+
+| Name        | Description                                                                                                             |
+| ----------- | ----------------------------------------------------------------------------------------------------------------------- |
+| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~                                          |
+| `vocab`     | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~  |
+| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ |
+
 ## Training data and alignment {#gold source="spacy/training"}
 
 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

From 36ecba224e2111d39a0b45b3019c63d2dd1529f0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 14:58:29 +0200
Subject: [PATCH 115/146] Set up GPU CI testing (#7293)

* Set up CI for tests with GPU agent

* Update tests for enabled GPU

* Fix steps filename

* Add parallel build jobs as a setting

* Fix test requirements

* Fix install test requirements condition

* Fix pipeline models test

* Reset current ops in prefer/require testing

* Fix more tests

* Remove separate test_models test

* Fix regression 5551

* fix StaticVectors for GPU use

* fix vocab tests

* Fix regression test 5082

* Move azure steps to .github and reenable default pool jobs

* Consolidate/rename azure steps

Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com>
---
 .github/azure-steps.yml                       | 57 ++++++++++++++++++
 azure-pipelines.yml                           | 53 ++++++-----------
 spacy/ml/staticvectors.py                     |  2 +-
 spacy/tests/enable_gpu.py                     |  3 +
 spacy/tests/pipeline/test_entity_ruler.py     | 16 ++---
 spacy/tests/pipeline/test_models.py           | 12 ++--
 spacy/tests/pipeline/test_textcat.py          | 18 ++++--
 spacy/tests/pipeline/test_tok2vec.py          |  7 ++-
 spacy/tests/regression/test_issue4501-5000.py | 50 ++++++++--------
 spacy/tests/regression/test_issue5001-5500.py | 16 ++---
 spacy/tests/regression/test_issue5501-6000.py | 10 ++--
 spacy/tests/test_language.py                  | 31 ++++++----
 spacy/tests/test_misc.py                      |  9 ++-
 spacy/tests/test_models.py                    | 26 ++++++--
 spacy/tests/util.py                           |  6 +-
 spacy/tests/vocab_vectors/test_vectors.py     | 59 ++++++++++---------
 16 files changed, 238 insertions(+), 137 deletions(-)
 create mode 100644 .github/azure-steps.yml
 create mode 100644 spacy/tests/enable_gpu.py

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
new file mode 100644
index 000000000..750e096d0
--- /dev/null
+++ b/.github/azure-steps.yml
@@ -0,0 +1,57 @@
+parameters:
+  python_version: ''
+  architecture: ''
+  prefix: ''
+  gpu: false
+  num_build_jobs: 1
+
+steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: ${{ parameters.python_version }}
+      architecture: ${{ parameters.architecture }}
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U pip setuptools
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install dependencies"
+
+  - script: |
+      ${{ parameters.prefix }} python setup.py build_ext --inplace -j ${{ parameters.num_build_jobs }}
+      ${{ parameters.prefix }} python setup.py sdist --formats=gztar
+    displayName: "Compile and build sdist"
+
+  - task: DeleteFiles@1
+    inputs:
+      contents: "spacy"
+    displayName: "Delete source directory"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip freeze --exclude torch --exclude cupy-cuda110 > installed.txt
+      ${{ parameters.prefix }} python -m pip uninstall -y -r installed.txt
+    displayName: "Uninstall all packages"
+
+  - bash: |
+      ${{ parameters.prefix }} SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
+      ${{ parameters.prefix }} python -m pip install dist/$SDIST
+    displayName: "Install from sdist"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U -r requirements.txt
+    displayName: "Install test requirements"
+
+  - script: |
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110
+      ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
+    displayName: "Install GPU requirements"
+    condition: eq(${{ parameters.gpu }}, true)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy
+    displayName: "Run CPU tests"
+    condition: eq(${{ parameters.gpu }}, false)
+
+  - script: |
+      ${{ parameters.prefix }} python -m pytest --pyargs spacy -p spacy.tests.enable_gpu
+    displayName: "Run GPU tests"
+    condition: eq(${{ parameters.gpu }}, true)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index bb259dded..bea65cae2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -76,39 +76,24 @@ jobs:
       maxParallel: 4
     pool:
       vmImage: $(imageName)
-
     steps:
-      - task: UsePythonVersion@0
-        inputs:
-          versionSpec: "$(python.version)"
-          architecture: "x64"
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'
+          architecture: 'x64'
 
-      - script: |
-          python -m pip install -U setuptools
-          pip install -r requirements.txt
-        displayName: "Install dependencies"
-
-      - script: |
-          python setup.py build_ext --inplace
-          python setup.py sdist --formats=gztar
-        displayName: "Compile and build sdist"
-
-      - task: DeleteFiles@1
-        inputs:
-          contents: "spacy"
-        displayName: "Delete source directory"
-
-      - script: |
-          pip freeze > installed.txt
-          pip uninstall -y -r installed.txt
-        displayName: "Uninstall all packages"
-
-      - bash: |
-          SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1)
-          pip install dist/$SDIST
-        displayName: "Install from sdist"
-
-      - script: |
-          pip install -r requirements.txt
-          python -m pytest --pyargs spacy
-        displayName: "Run tests"
+  - job: "TestGPU"
+    dependsOn: "Validate"
+    strategy:
+      matrix:
+        Python38LinuxX64_GPU:
+          python.version: '3.8'
+    pool:
+      name: "LinuxX64_GPU"
+    steps:
+      - template: .github/azure-steps.yml
+        parameters:
+          python_version: '$(python.version)'
+          architecture: 'x64'
+          gpu: true
+          num_build_jobs: 24
diff --git a/spacy/ml/staticvectors.py b/spacy/ml/staticvectors.py
index cfd25c24b..4e7262e7d 100644
--- a/spacy/ml/staticvectors.py
+++ b/spacy/ml/staticvectors.py
@@ -38,7 +38,7 @@ def forward(
         return _handle_empty(model.ops, model.get_dim("nO"))
     key_attr = model.attrs["key_attr"]
     W = cast(Floats2d, model.ops.as_contig(model.get_param("W")))
-    V = cast(Floats2d, docs[0].vocab.vectors.data)
+    V = cast(Floats2d, model.ops.asarray(docs[0].vocab.vectors.data))
     rows = model.ops.flatten(
         [doc.vocab.vectors.find(keys=doc.to_array(key_attr)) for doc in docs]
     )
diff --git a/spacy/tests/enable_gpu.py b/spacy/tests/enable_gpu.py
new file mode 100644
index 000000000..3d4fded10
--- /dev/null
+++ b/spacy/tests/enable_gpu.py
@@ -0,0 +1,3 @@
+from spacy import require_gpu
+
+require_gpu()
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index 3f998d78d..2f6da79d6 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -5,6 +5,7 @@ from spacy.tokens import Span
 from spacy.language import Language
 from spacy.pipeline import EntityRuler
 from spacy.errors import MatchPatternError
+from thinc.api import NumpyOps, get_current_ops
 
 
 @pytest.fixture
@@ -201,13 +202,14 @@ def test_entity_ruler_overlapping_spans(nlp):
 
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_entity_ruler_multiprocessing(nlp, n_process):
-    texts = ["I enjoy eating Pizza Hut pizza."]
+    if isinstance(get_current_ops(), NumpyOps) or n_process < 2:
+        texts = ["I enjoy eating Pizza Hut pizza."]
 
-    patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
+        patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut", "id": "1234"}]
 
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
+        ruler = nlp.add_pipe("entity_ruler")
+        ruler.add_patterns(patterns)
 
-    for doc in nlp.pipe(texts, n_process=2):
-        for ent in doc.ents:
-            assert ent.ent_id_ == "1234"
+        for doc in nlp.pipe(texts, n_process=n_process):  # honour the parametrized value
+            for ent in doc.ents:
+                assert ent.ent_id_ == "1234"
diff --git a/spacy/tests/pipeline/test_models.py b/spacy/tests/pipeline/test_models.py
index d04ac9cd4..302c307e2 100644
--- a/spacy/tests/pipeline/test_models.py
+++ b/spacy/tests/pipeline/test_models.py
@@ -4,7 +4,7 @@ import numpy
 import pytest
 from numpy.testing import assert_almost_equal
 from spacy.vocab import Vocab
-from thinc.api import NumpyOps, Model, data_validation
+from thinc.api import Model, data_validation, get_current_ops
 from thinc.types import Array2d, Ragged
 
 from spacy.lang.en import English
@@ -13,7 +13,7 @@ from spacy.ml._character_embed import CharacterEmbed
 from spacy.tokens import Doc
 
 
-OPS = NumpyOps()
+OPS = get_current_ops()
 
 texts = ["These are 4 words", "Here just three"]
 l0 = [[1, 2], [3, 4], [5, 6], [7, 8]]
@@ -82,7 +82,7 @@ def util_batch_unbatch_docs_list(
         Y_batched = model.predict(in_data)
         Y_not_batched = [model.predict([u])[0] for u in in_data]
         for i in range(len(Y_batched)):
-            assert_almost_equal(Y_batched[i], Y_not_batched[i], decimal=4)
+            assert_almost_equal(OPS.to_numpy(Y_batched[i]), OPS.to_numpy(Y_not_batched[i]), decimal=4)
 
 
 def util_batch_unbatch_docs_array(
@@ -91,7 +91,7 @@ def util_batch_unbatch_docs_array(
     with data_validation(True):
         model.initialize(in_data, out_data)
         Y_batched = model.predict(in_data).tolist()
-        Y_not_batched = [model.predict([u])[0] for u in in_data]
+        Y_not_batched = [model.predict([u])[0].tolist() for u in in_data]
         assert_almost_equal(Y_batched, Y_not_batched, decimal=4)
 
 
@@ -100,8 +100,8 @@ def util_batch_unbatch_docs_ragged(
 ):
     with data_validation(True):
         model.initialize(in_data, out_data)
-        Y_batched = model.predict(in_data)
+        Y_batched = model.predict(in_data).data.tolist()
         Y_not_batched = []
         for u in in_data:
             Y_not_batched.extend(model.predict([u]).data.tolist())
-        assert_almost_equal(Y_batched.data, Y_not_batched, decimal=4)
+        assert_almost_equal(Y_batched, Y_not_batched, decimal=4)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 61af16eb5..43dfff147 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -1,7 +1,7 @@
 import pytest
 import random
 import numpy.random
-from numpy.testing import assert_equal
+from numpy.testing import assert_almost_equal
 from thinc.api import fix_random_seed
 from spacy import util
 from spacy.lang.en import English
@@ -222,8 +222,12 @@ def test_overfitting_IO():
     batch_cats_1 = [doc.cats for doc in nlp.pipe(texts)]
     batch_cats_2 = [doc.cats for doc in nlp.pipe(texts)]
     no_batch_cats = [doc.cats for doc in [nlp(text) for text in texts]]
-    assert_equal(batch_cats_1, batch_cats_2)
-    assert_equal(batch_cats_1, no_batch_cats)
+    for cats_1, cats_2 in zip(batch_cats_1, batch_cats_2):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
+    for cats_1, cats_2 in zip(batch_cats_1, no_batch_cats):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
 
 
 def test_overfitting_IO_multi():
@@ -270,8 +274,12 @@ def test_overfitting_IO_multi():
     batch_deps_1 = [doc.cats for doc in nlp.pipe(texts)]
     batch_deps_2 = [doc.cats for doc in nlp.pipe(texts)]
     no_batch_deps = [doc.cats for doc in [nlp(text) for text in texts]]
-    assert_equal(batch_deps_1, batch_deps_2)
-    assert_equal(batch_deps_1, no_batch_deps)
+    for cats_1, cats_2 in zip(batch_deps_1, batch_deps_2):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
+    for cats_1, cats_2 in zip(batch_deps_1, no_batch_deps):
+        for cat in cats_1:
+            assert_almost_equal(cats_1[cat], cats_2[cat], decimal=5)
 
 
 # fmt: off
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index ac5428de6..e3b71c502 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -8,8 +8,8 @@ from spacy.tokens import Doc
 from spacy.training import Example
 from spacy import util
 from spacy.lang.en import English
-from thinc.api import Config
-from numpy.testing import assert_equal
+from thinc.api import Config, get_current_ops
+from numpy.testing import assert_array_equal
 
 from ..util import get_batch, make_tempdir
 
@@ -160,7 +160,8 @@ def test_tok2vec_listener():
 
     doc = nlp("Running the pipeline as a whole.")
     doc_tensor = tagger_tok2vec.predict([doc])[0]
-    assert_equal(doc.tensor, doc_tensor)
+    ops = get_current_ops()
+    assert_array_equal(ops.to_numpy(doc.tensor), ops.to_numpy(doc_tensor))
 
     # TODO: should this warn or error?
     nlp.select_pipes(disable="tok2vec")
diff --git a/spacy/tests/regression/test_issue4501-5000.py b/spacy/tests/regression/test_issue4501-5000.py
index 6dbbc233b..f5fcb53fd 100644
--- a/spacy/tests/regression/test_issue4501-5000.py
+++ b/spacy/tests/regression/test_issue4501-5000.py
@@ -9,6 +9,7 @@ from spacy.language import Language
 from spacy.util import ensure_path, load_model_from_path
 import numpy
 import pickle
+from thinc.api import NumpyOps, get_current_ops
 
 from ..util import make_tempdir
 
@@ -169,21 +170,22 @@ def test_issue4725_1():
 
 
 def test_issue4725_2():
-    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
-    # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
-    # or because of issues with pickling the NER (cf test_issue4725_1)
-    vocab = Vocab(vectors_name="test_vocab_add_vector")
-    data = numpy.ndarray((5, 3), dtype="f")
-    data[0] = 1.0
-    data[1] = 2.0
-    vocab.set_vector("cat", data[0])
-    vocab.set_vector("dog", data[1])
-    nlp = English(vocab=vocab)
-    nlp.add_pipe("ner")
-    nlp.initialize()
-    docs = ["Kurt is in London."] * 10
-    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
-        pass
+    if isinstance(get_current_ops(), NumpyOps):
+        # ensures that this runs correctly and doesn't hang or crash because of the global vectors
+        # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
+        # or because of issues with pickling the NER (cf test_issue4725_1)
+        vocab = Vocab(vectors_name="test_vocab_add_vector")
+        data = numpy.ndarray((5, 3), dtype="f")
+        data[0] = 1.0
+        data[1] = 2.0
+        vocab.set_vector("cat", data[0])
+        vocab.set_vector("dog", data[1])
+        nlp = English(vocab=vocab)
+        nlp.add_pipe("ner")
+        nlp.initialize()
+        docs = ["Kurt is in London."] * 10
+        for _ in nlp.pipe(docs, batch_size=2, n_process=2):
+            pass
 
 
 def test_issue4849():
@@ -204,10 +206,11 @@ def test_issue4849():
         count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
     assert count_ents == 2
     # USING 2 PROCESSES
-    count_ents = 0
-    for doc in nlp.pipe([text], n_process=2):
-        count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
-    assert count_ents == 2
+    if isinstance(get_current_ops(), NumpyOps):
+        count_ents = 0
+        for doc in nlp.pipe([text], n_process=2):
+            count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
+        assert count_ents == 2
 
 
 @Language.factory("my_pipe")
@@ -239,10 +242,11 @@ def test_issue4903():
     nlp.add_pipe("sentencizer")
     nlp.add_pipe("my_pipe", after="sentencizer")
     text = ["I like bananas.", "Do you like them?", "No, I prefer wasabi."]
-    docs = list(nlp.pipe(text, n_process=2))
-    assert docs[0].text == "I like bananas."
-    assert docs[1].text == "Do you like them?"
-    assert docs[2].text == "No, I prefer wasabi."
+    if isinstance(get_current_ops(), NumpyOps):
+        docs = list(nlp.pipe(text, n_process=2))
+        assert docs[0].text == "I like bananas."
+        assert docs[1].text == "Do you like them?"
+        assert docs[2].text == "No, I prefer wasabi."
 
 
 def test_issue4924():
diff --git a/spacy/tests/regression/test_issue5001-5500.py b/spacy/tests/regression/test_issue5001-5500.py
index dbfe78679..0575c8270 100644
--- a/spacy/tests/regression/test_issue5001-5500.py
+++ b/spacy/tests/regression/test_issue5001-5500.py
@@ -6,6 +6,7 @@ from spacy.language import Language
 from spacy.lang.en.syntax_iterators import noun_chunks
 from spacy.vocab import Vocab
 import spacy
+from thinc.api import get_current_ops
 import pytest
 
 from ...util import make_tempdir
@@ -54,16 +55,17 @@ def test_issue5082():
     ruler.add_patterns(patterns)
     parsed_vectors_1 = [t.vector for t in nlp(text)]
     assert len(parsed_vectors_1) == 4
-    numpy.testing.assert_array_equal(parsed_vectors_1[0], array1)
-    numpy.testing.assert_array_equal(parsed_vectors_1[1], array2)
-    numpy.testing.assert_array_equal(parsed_vectors_1[2], array3)
-    numpy.testing.assert_array_equal(parsed_vectors_1[3], array4)
+    ops = get_current_ops()
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[2]), array3)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_1[3]), array4)
     nlp.add_pipe("merge_entities")
     parsed_vectors_2 = [t.vector for t in nlp(text)]
     assert len(parsed_vectors_2) == 3
-    numpy.testing.assert_array_equal(parsed_vectors_2[0], array1)
-    numpy.testing.assert_array_equal(parsed_vectors_2[1], array2)
-    numpy.testing.assert_array_equal(parsed_vectors_2[2], array34)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[0]), array1)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[1]), array2)
+    numpy.testing.assert_array_equal(ops.to_numpy(parsed_vectors_2[2]), array34)
 
 
 def test_issue5137():
diff --git a/spacy/tests/regression/test_issue5501-6000.py b/spacy/tests/regression/test_issue5501-6000.py
index 8d1199e98..a35de92fa 100644
--- a/spacy/tests/regression/test_issue5501-6000.py
+++ b/spacy/tests/regression/test_issue5501-6000.py
@@ -1,5 +1,6 @@
 import pytest
-from thinc.api import Config, fix_random_seed
+from numpy.testing import assert_almost_equal
+from thinc.api import Config, fix_random_seed, get_current_ops
 
 from spacy.lang.en import English
 from spacy.pipeline.textcat import single_label_default_config, single_label_bow_config
@@ -44,11 +45,12 @@ def test_issue5551(textcat_config):
         nlp.update([Example.from_dict(doc, annots)])
         # Store the result of each iteration
         result = pipe.model.predict([doc])
-        results.append(list(result[0]))
+        results.append(result[0])
     # All results should be the same because of the fixed seed
     assert len(results) == 3
-    assert results[0] == results[1]
-    assert results[0] == results[2]
+    ops = get_current_ops()
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[1]))
+    assert_almost_equal(ops.to_numpy(results[0]), ops.to_numpy(results[2]))
 
 
 def test_issue5838():
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index bec85a1a2..7fb03da0c 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -10,6 +10,7 @@ from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.util import registry, ignore_error, raise_error
 import spacy
+from thinc.api import NumpyOps, get_current_ops
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -142,25 +143,29 @@ def texts():
 
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe(nlp2, n_process, texts):
-    texts = texts * 10
-    expecteds = [nlp2(text) for text in texts]
-    docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
+    ops = get_current_ops()
+    if isinstance(ops, NumpyOps) or n_process < 2:
+        texts = texts * 10
+        expecteds = [nlp2(text) for text in texts]
+        docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
 
-    for doc, expected_doc in zip(docs, expecteds):
-        assert_docs_equal(doc, expected_doc)
+        for doc, expected_doc in zip(docs, expecteds):
+            assert_docs_equal(doc, expected_doc)
 
 
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_language_pipe_stream(nlp2, n_process, texts):
-    # check if nlp.pipe can handle infinite length iterator properly.
-    stream_texts = itertools.cycle(texts)
-    texts0, texts1 = itertools.tee(stream_texts)
-    expecteds = (nlp2(text) for text in texts0)
-    docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
+    ops = get_current_ops()
+    if isinstance(ops, NumpyOps) or n_process < 2:
+        # check if nlp.pipe can handle infinite length iterator properly.
+        stream_texts = itertools.cycle(texts)
+        texts0, texts1 = itertools.tee(stream_texts)
+        expecteds = (nlp2(text) for text in texts0)
+        docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
 
-    n_fetch = 20
-    for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
-        assert_docs_equal(doc, expected_doc)
+        n_fetch = 20
+        for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
+            assert_docs_equal(doc, expected_doc)
 
 
 def test_language_pipe_error_handler():
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 58bebc4ca..0d09999a9 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -8,7 +8,8 @@ from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from thinc.api import Config, Optimizer, ConfigValidationError
+from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops
+from thinc.api import set_current_ops
 from spacy.training.batchers import minibatch_by_words
 from spacy.lang.en import English
 from spacy.lang.nl import Dutch
@@ -81,6 +82,7 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
 
 
 def test_prefer_gpu():
+    current_ops = get_current_ops()
     try:
         import cupy  # noqa: F401
 
@@ -88,9 +90,11 @@ def test_prefer_gpu():
         assert isinstance(get_current_ops(), CupyOps)
     except ImportError:
         assert not prefer_gpu()
+    set_current_ops(current_ops)
 
 
 def test_require_gpu():
+    current_ops = get_current_ops()
     try:
         import cupy  # noqa: F401
 
@@ -99,9 +103,11 @@ def test_require_gpu():
     except ImportError:
         with pytest.raises(ValueError):
             require_gpu()
+    set_current_ops(current_ops)
 
 
 def test_require_cpu():
+    current_ops = get_current_ops()
     require_cpu()
     assert isinstance(get_current_ops(), NumpyOps)
     try:
@@ -113,6 +119,7 @@ def test_require_cpu():
         pass
     require_cpu()
     assert isinstance(get_current_ops(), NumpyOps)
+    set_current_ops(current_ops)
 
 
 def test_ascii_filenames():
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index 200d7dcfd..45cee13ea 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -1,7 +1,7 @@
 from typing import List
 import pytest
 from thinc.api import fix_random_seed, Adam, set_dropout_rate
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_array_almost_equal
 import numpy
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
 from spacy.ml.models import build_bow_text_classifier, build_simple_cnn_text_classifier
@@ -109,7 +109,7 @@ def test_models_initialize_consistently(seed, model_func, kwargs):
     model2.initialize()
     params1 = get_all_params(model1)
     params2 = get_all_params(model2)
-    assert_array_equal(params1, params2)
+    assert_array_equal(model1.ops.to_numpy(params1), model2.ops.to_numpy(params2))
 
 
 @pytest.mark.parametrize(
@@ -134,14 +134,25 @@ def test_models_predict_consistently(seed, model_func, kwargs, get_X):
         for i in range(len(tok2vec1)):
             for j in range(len(tok2vec1[i])):
                 assert_array_equal(
-                    numpy.asarray(tok2vec1[i][j]), numpy.asarray(tok2vec2[i][j])
+                    numpy.asarray(model1.ops.to_numpy(tok2vec1[i][j])),
+                    numpy.asarray(model2.ops.to_numpy(tok2vec2[i][j])),
                 )
 
+    try:
+        Y1 = model1.ops.to_numpy(Y1)
+        Y2 = model2.ops.to_numpy(Y2)
+    except Exception:
+        pass
     if isinstance(Y1, numpy.ndarray):
         assert_array_equal(Y1, Y2)
     elif isinstance(Y1, List):
         assert len(Y1) == len(Y2)
         for y1, y2 in zip(Y1, Y2):
+            try:
+                y1 = model1.ops.to_numpy(y1)
+                y2 = model2.ops.to_numpy(y2)
+            except Exception:
+                pass
             assert_array_equal(y1, y2)
     else:
         raise ValueError(f"Could not compare type {type(Y1)}")
@@ -169,12 +180,17 @@ def test_models_update_consistently(seed, dropout, model_func, kwargs, get_X):
             model.finish_update(optimizer)
         updated_params = get_all_params(model)
         with pytest.raises(AssertionError):
-            assert_array_equal(initial_params, updated_params)
+            assert_array_equal(
+                model.ops.to_numpy(initial_params), model.ops.to_numpy(updated_params)
+            )
         return model
 
     model1 = get_updated_model()
     model2 = get_updated_model()
-    assert_array_equal(get_all_params(model1), get_all_params(model2))
+    assert_array_almost_equal(
+        model1.ops.to_numpy(get_all_params(model1)),
+        model2.ops.to_numpy(get_all_params(model2)),
+    )
 
 
 @pytest.mark.parametrize("model_func,kwargs", [(StaticVectors, {"nO": 128, "nM": 300})])
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index ef7b4d00d..365ea4349 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -5,6 +5,7 @@ import srsly
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 from spacy.util import make_tempdir  # noqa: F401
+from thinc.api import get_current_ops
 
 
 @contextlib.contextmanager
@@ -58,7 +59,10 @@ def add_vecs_to_vocab(vocab, vectors):
 
 def get_cosine(vec1, vec2):
     """Get cosine for two given vectors"""
-    return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))
+    OPS = get_current_ops()
+    v1 = OPS.to_numpy(OPS.asarray(vec1))
+    v2 = OPS.to_numpy(OPS.asarray(vec2))
+    return numpy.dot(v1, v2) / (numpy.linalg.norm(v1) * numpy.linalg.norm(v2))
 
 
 def assert_docs_equal(doc1, doc2):
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 4257022ea..37d48ad0f 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy
 from numpy.testing import assert_allclose, assert_equal
+from thinc.api import get_current_ops
 from spacy.vocab import Vocab
 from spacy.vectors import Vectors
 from spacy.tokenizer import Tokenizer
@@ -9,6 +10,7 @@ from spacy.tokens import Doc
 
 from ..util import add_vecs_to_vocab, get_cosine, make_tempdir
 
+OPS = get_current_ops()
 
 @pytest.fixture
 def strings():
@@ -18,21 +20,21 @@ def strings():
 @pytest.fixture
 def vectors():
     return [
-        ("apple", [1, 2, 3]),
-        ("orange", [-1, -2, -3]),
-        ("and", [-1, -1, -1]),
-        ("juice", [5, 5, 10]),
-        ("pie", [7, 6.3, 8.9]),
+        ("apple", OPS.asarray([1, 2, 3])),
+        ("orange", OPS.asarray([-1, -2, -3])),
+        ("and", OPS.asarray([-1, -1, -1])),
+        ("juice", OPS.asarray([5, 5, 10])),
+        ("pie", OPS.asarray([7, 6.3, 8.9])),
     ]
 
 
 @pytest.fixture
 def ngrams_vectors():
     return [
-        ("apple", [1, 2, 3]),
-        ("app", [-0.1, -0.2, -0.3]),
-        ("ppl", [-0.2, -0.3, -0.4]),
-        ("pl", [0.7, 0.8, 0.9]),
+        ("apple", OPS.asarray([1, 2, 3])),
+        ("app", OPS.asarray([-0.1, -0.2, -0.3])),
+        ("ppl", OPS.asarray([-0.2, -0.3, -0.4])),
+        ("pl", OPS.asarray([0.7, 0.8, 0.9])),
     ]
 
 
@@ -171,8 +173,10 @@ def test_vectors_most_similar_identical():
 @pytest.mark.parametrize("text", ["apple and orange"])
 def test_vectors_token_vector(tokenizer_v, vectors, text):
     doc = tokenizer_v(text)
-    assert vectors[0] == (doc[0].text, list(doc[0].vector))
-    assert vectors[1] == (doc[2].text, list(doc[2].vector))
+    assert vectors[0][0] == doc[0].text
+    assert all([a == b for a, b in zip(vectors[0][1], doc[0].vector)])
+    assert vectors[1][0] == doc[2].text
+    assert all([a == b for a, b in zip(vectors[1][1], doc[2].vector)])
 
 
 @pytest.mark.parametrize("text", ["apple"])
@@ -301,7 +305,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
 
 def test_vocab_add_vector():
     vocab = Vocab(vectors_name="test_vocab_add_vector")
-    data = numpy.ndarray((5, 3), dtype="f")
+    data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
     vocab.set_vector("cat", data[0])
@@ -320,10 +324,10 @@ def test_vocab_prune_vectors():
     _ = vocab["cat"]  # noqa: F841
     _ = vocab["dog"]  # noqa: F841
     _ = vocab["kitten"]  # noqa: F841
-    data = numpy.ndarray((5, 3), dtype="f")
-    data[0] = [1.0, 1.2, 1.1]
-    data[1] = [0.3, 1.3, 1.0]
-    data[2] = [0.9, 1.22, 1.05]
+    data = OPS.xp.ndarray((5, 3), dtype="f")
+    data[0] = OPS.asarray([1.0, 1.2, 1.1])
+    data[1] = OPS.asarray([0.3, 1.3, 1.0])
+    data[2] = OPS.asarray([0.9, 1.22, 1.05])
     vocab.set_vector("cat", data[0])
     vocab.set_vector("dog", data[1])
     vocab.set_vector("kitten", data[2])
@@ -332,40 +336,41 @@ def test_vocab_prune_vectors():
     assert list(remap.keys()) == ["kitten"]
     neighbour, similarity = list(remap.values())[0]
     assert neighbour == "cat", remap
-    assert_allclose(similarity, get_cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
+    cosine = get_cosine(data[0], data[2])
+    assert_allclose(float(similarity), cosine, atol=1e-4, rtol=1e-3)
 
 
 def test_vectors_serialize():
-    data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+    data = OPS.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
     v = Vectors(data=data, keys=["A", "B", "C"])
     b = v.to_bytes()
     v_r = Vectors()
     v_r.from_bytes(b)
-    assert_equal(v.data, v_r.data)
+    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
     assert v.key2row == v_r.key2row
     v.resize((5, 4))
     v_r.resize((5, 4))
-    row = v.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
-    row_r = v_r.add("D", vector=numpy.asarray([1, 2, 3, 4], dtype="f"))
+    row = v.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
+    row_r = v_r.add("D", vector=OPS.asarray([1, 2, 3, 4], dtype="f"))
     assert row == row_r
-    assert_equal(v.data, v_r.data)
+    assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
     assert v.is_full == v_r.is_full
     with make_tempdir() as d:
         v.to_disk(d)
         v_r.from_disk(d)
-        assert_equal(v.data, v_r.data)
+        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
         assert v.key2row == v_r.key2row
         v.resize((5, 4))
         v_r.resize((5, 4))
-        row = v.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
-        row_r = v_r.add("D", vector=numpy.asarray([10, 20, 30, 40], dtype="f"))
+        row = v.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
+        row_r = v_r.add("D", vector=OPS.asarray([10, 20, 30, 40], dtype="f"))
         assert row == row_r
-        assert_equal(v.data, v_r.data)
+        assert_equal(OPS.to_numpy(v.data), OPS.to_numpy(v_r.data))
 
 
 def test_vector_is_oov():
     vocab = Vocab(vectors_name="test_vocab_is_oov")
-    data = numpy.ndarray((5, 3), dtype="f")
+    data = OPS.xp.ndarray((5, 3), dtype="f")
     data[0] = 1.0
     data[1] = 2.0
     vocab.set_vector("cat", data[0])

From 8a95475b3dce2e52bc9be53a7b8c9ad49d7fc32c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 22 Apr 2021 16:33:26 +0200
Subject: [PATCH 116/146] Set version to v3.0.6 (#7854)

---
 spacy/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/about.py b/spacy/about.py
index 2987f3c53..c351076c5 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.5"
+__version__ = "3.0.6"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

From df3444421aba611d4ad1238610ce189df158d85a Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 23 Apr 2021 12:16:12 +0200
Subject: [PATCH 117/146] Update spacy-legacy to >=3.0.4 (#7865)

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 517553241..1947dd2de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.3,<3.1.0
+spacy-legacy>=3.0.4,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.3,<8.1.0
diff --git a/setup.cfg b/setup.cfg
index ffdb8b2b8..9e1293335 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     thinc>=8.0.3,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.3,<3.1.0
+    spacy-legacy>=3.0.4,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0

From 874cd025395b9bbcfb4ab5991fdf24cc99fd95e1 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 26 Apr 2021 17:06:32 +0200
Subject: [PATCH 118/146] Set spacy-legacy to >=3.0.5 (#7897)

Set `spacy-legacy` to `>=3.0.5` due to `spacy.StaticVectors.v1` init bug.
---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1947dd2de..a8a15a01b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.4,<3.1.0
+spacy-legacy>=3.0.5,<3.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 thinc>=8.0.3,<8.1.0
diff --git a/setup.cfg b/setup.cfg
index 9e1293335..2fedd8f5c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,7 +37,7 @@ setup_requires =
     thinc>=8.0.3,<8.1.0
 install_requires =
     # Our libraries
-    spacy-legacy>=3.0.4,<3.1.0
+    spacy-legacy>=3.0.5,<3.1.0
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0

From 1690595e4d243378dd13542090c658429fd87d15 Mon Sep 17 00:00:00 2001
From: Janis Klaise <janis.klaise@gmail.com>
Date: Tue, 27 Apr 2021 08:13:39 +0100
Subject: [PATCH 119/146] Update load_lookups return type and docstring (#7907)

* Update load_lookups return type and docstring

* Add contributor agreement
---
 .github/contributors/jklaise.md | 106 ++++++++++++++++++++++++++++++++
 spacy/lookups.py                |   8 +--
 2 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 .github/contributors/jklaise.md

diff --git a/.github/contributors/jklaise.md b/.github/contributors/jklaise.md
new file mode 100644
index 000000000..66d77ee48
--- /dev/null
+++ b/.github/contributors/jklaise.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           |Janis Klaise          |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           |26/04/2021            |
+| GitHub username                |jklaise               |
+| Website (optional)             |janisklaise.com       |
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 76535d1de..f635f0dcf 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -1,4 +1,4 @@
-from typing import Dict, Any, List, Union, Optional
+from typing import Any, List, Union, Optional
 from pathlib import Path
 import srsly
 from preshed.bloom import BloomFilter
@@ -14,16 +14,16 @@ UNSET = object()
 
 def load_lookups(
     lang: str, tables: List[str], strict: bool = True
-) -> Optional[Dict[str, Any]]:
+) -> 'Lookups':
     """Load the data from the spacy-lookups-data package for a given language,
-    if available. Returns an empty dict if there's no data or if the package
+    if available. Returns an empty `Lookups` container if there's no data or if the package
     is not installed.
 
     lang (str): The language code (corresponds to entry point exposed by
         the spacy-lookups-data package).
     tables (List[str]): Name of tables to load, e.g. ["lemma_lookup", "lemma_exc"]
     strict (bool): Whether to raise an error if a table doesn't exist.
-    RETURNS (Dict[str, Any]): The lookups, keyed by table name.
+    RETURNS (Lookups): The lookups container containing the loaded tables.
     """
     # TODO: import spacy_lookups_data instead of going via entry points here?
     lookups = Lookups()

From de6b5ed14dcb036c02e92664365ea2b1fb6cf21c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Tue, 27 Apr 2021 16:16:35 +0900
Subject: [PATCH 120/146] Fix percent unk display in debug data (#7886)

* Fix percent unk display

This was showing (ratio %), so 10% would show as 0.10%. Fix by
multiplying the ratio by 100.

Might want to add a warning if this is over a threshold.

* Only show whole-integer percents
---
 spacy/cli/debug_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index 3351e53fe..1ebf65957 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -173,8 +173,8 @@ def debug_data(
         )
         n_missing_vectors = sum(gold_train_data["words_missing_vectors"].values())
         msg.warn(
-            "{} words in training data without vectors ({:0.2f}%)".format(
-                n_missing_vectors, n_missing_vectors / gold_train_data["n_words"]
+            "{} words in training data without vectors ({:.0f}%)".format(
+                n_missing_vectors, 100 * (n_missing_vectors / gold_train_data["n_words"])
             ),
         )
         msg.text(

From 8007d5c8148460d08a6aa500dff0eabb0f504f23 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 28 Apr 2021 16:17:15 +0900
Subject: [PATCH 121/146] Check if the resume path points to a directory
 (#7919)

This came up in #7878: if --resume-path is a directory, then loading
the weights will fail. On Linux this will give a straightforward error
message, but on Windows it gives "Permission Denied", which is
confusing.
---
 spacy/cli/pretrain.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 1f8fc99cc..fe3ce0dad 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -95,6 +95,13 @@ def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
                 "then the new directory will be created for you.",
             )
     if resume_path is not None:
+        if resume_path.is_dir():
+            # This is necessary because Windows gives a Permission Denied when we
+            # try to open the directory later, which is confusing. See #7878
+            msg.fail(
+                "--resume-path should be a weights file, but {resume_path} is a directory.",
+                exits=True,
+            )
         model_name = re.search(r"model\d+\.bin", str(resume_path))
         if not model_name and not epoch_resume:
             msg.fail(

From f4080983eab96a1c43a98d2553bc2a2cdea3986d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 28 Apr 2021 10:18:24 +0200
Subject: [PATCH 122/146] Extend to cupy 9.0.0 (#7914)

---
 .github/azure-steps.yml |  2 +-
 setup.cfg               | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 750e096d0..d536f2eb8 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -41,7 +41,7 @@ steps:
     displayName: "Install test requirements"
 
   - script: |
-      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110
+      ${{ parameters.prefix }} python -m pip install -U cupy-cuda110 -f https://github.com/cupy/cupy/releases/v9.0.0
       ${{ parameters.prefix }} python -m pip install "torch==1.7.1+cu110" -f https://download.pytorch.org/whl/torch_stable.html
     displayName: "Install GPU requirements"
     condition: eq(${{ parameters.gpu }}, true)
diff --git a/setup.cfg b/setup.cfg
index 2fedd8f5c..63d603a9c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -71,27 +71,27 @@ transformers =
 ray =
     spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<9.0.0
+    cupy>=5.0.0b4,<10.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<9.0.0
+    cupy-cuda80>=5.0.0b4,<10.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<9.0.0
+    cupy-cuda90>=5.0.0b4,<10.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<9.0.0
+    cupy-cuda91>=5.0.0b4,<10.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<9.0.0
+    cupy-cuda92>=5.0.0b4,<10.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<9.0.0
+    cupy-cuda100>=5.0.0b4,<10.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<9.0.0
+    cupy-cuda101>=5.0.0b4,<10.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<9.0.0
+    cupy-cuda102>=5.0.0b4,<10.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<9.0.0
+    cupy-cuda110>=5.0.0b4,<10.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<9.0.0
+    cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<9.0.0
+    cupy-cuda112>=5.0.0b4,<10.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9

From 49aed683cce4d58baca10e7cb4fe89fbfc209a36 Mon Sep 17 00:00:00 2001
From: Sevdimali <sevdimaliisa@gmail.com>
Date: Wed, 28 Apr 2021 16:42:02 +0400
Subject: [PATCH 123/146] Azerbaijani language added (#7911)

---
 .github/contributors/sevdimali.md | 106 ++++++++++++++++++++++
 spacy/lang/az/__init__.py         |  21 +++++
 spacy/lang/az/examples.py         |  18 ++++
 spacy/lang/az/lex_attrs.py        |  89 ++++++++++++++++++
 spacy/lang/az/stop_words.py       | 145 ++++++++++++++++++++++++++++++
 5 files changed, 379 insertions(+)
 create mode 100644 .github/contributors/sevdimali.md
 create mode 100644 spacy/lang/az/__init__.py
 create mode 100644 spacy/lang/az/examples.py
 create mode 100644 spacy/lang/az/lex_attrs.py
 create mode 100644 spacy/lang/az/stop_words.py

diff --git a/.github/contributors/sevdimali.md b/.github/contributors/sevdimali.md
new file mode 100644
index 000000000..6b96abdf8
--- /dev/null
+++ b/.github/contributors/sevdimali.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Sevdimali            |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 10/4/2021            |
+| GitHub username                | sevdimali            |
+| Website (optional)             | https://sevdimali.me |
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
new file mode 100644
index 000000000..6a4288d1e
--- /dev/null
+++ b/spacy/lang/az/__init__.py
@@ -0,0 +1,21 @@
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .lex_attrs import LEX_ATTRS
+from ...language import Language
+
+
+class AzerbaijaniDefaults(Language.Defaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    token_match = TOKEN_MATCH
+    syntax_iterators = SYNTAX_ITERATORS
+
+
+class Azerbaijani(Language):
+    lang = "az"
+    Defaults = AzerbaijaniDefaults
+
+
+__all__ = ["Azerbaijani"]
diff --git a/spacy/lang/az/examples.py b/spacy/lang/az/examples.py
new file mode 100644
index 000000000..f3331a8cb
--- /dev/null
+++ b/spacy/lang/az/examples.py
@@ -0,0 +1,18 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.az.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Bu bir cümlədir.",
+    "Necəsən?",
+    "Qarabağ ordeni vətən müharibəsində qələbə münasibəti ilə təsis edilmişdir.",
+    "Məktəbimizə Bakıdan bir tarix müəllimi gəlmişdi.",
+    "Atılan növbəti mərmilər lap yaxınlıqda partladı.",
+    "Sinqapur koronavirus baxımından ən təhlükəsiz ölkələr sırasındadır.",
+    "Marsda ilk sınaq uçuşu həyata keçirilib.",
+    "SSRİ dağılandan bəri 5 sahil dövləti Xəzərin statusunu müəyyən edə bilməyiblər.",
+    "Videoda beyninə xüsusi çip yerləşdirilmiş meymun əks olunub.",
+]
diff --git a/spacy/lang/az/lex_attrs.py b/spacy/lang/az/lex_attrs.py
new file mode 100644
index 000000000..73a5e2762
--- /dev/null
+++ b/spacy/lang/az/lex_attrs.py
@@ -0,0 +1,89 @@
+from ...attrs import LIKE_NUM
+
+
+# Eleven, twelve etc. are written separate: on bir, on iki
+
+_num_words = [
+    "bir",
+    "iki",
+    "üç",
+    "dörd",
+    "beş",
+    "altı",
+    "yeddi",
+    "səkkiz",
+    "doqquz",
+    "on",
+    "iyirmi",
+    "otuz",
+    "qırx",
+    "əlli",
+    "altmış",
+    "yetmiş",
+    "səksən",
+    "doxsan",
+    "yüz",
+    "min",
+    "milyon",
+    "milyard",
+    "trilyon",
+    "kvadrilyon",
+    "kentilyon",
+]
+
+
+_ordinal_words = [
+    "birinci",
+    "ikinci",
+    "üçüncü",
+    "dördüncü",
+    "beşinci",
+    "altıncı",
+    "yedinci",
+    "səkkizinci",
+    "doqquzuncu",
+    "onuncu",
+    "iyirminci",
+    "otuzuncu",
+    "qırxıncı",
+    "əllinci",
+    "altmışıncı",
+    "yetmişinci",
+    "səksəninci",
+    "doxsanıncı",
+    "yüzüncü",
+    "mininci",
+    "milyonuncu",
+    "milyardıncı",
+    "trilyonuncu",
+    "kvadrilyonuncu",
+    "kentilyonuncu",
+]
+
+_ordinal_endings = ("inci", "ıncı", "nci", "ncı", "uncu", "üncü")
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    text_lower = text.lower()
+    # Check cardinal number
+    if text_lower in _num_words:
+        return True
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith(_ordinal_endings):
+        if text_lower[:-3].isdigit() or text_lower[:-4].isdigit():
+            return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/az/stop_words.py b/spacy/lang/az/stop_words.py
new file mode 100644
index 000000000..2114939ba
--- /dev/null
+++ b/spacy/lang/az/stop_words.py
@@ -0,0 +1,145 @@
+# Source: https://github.com/eliasdabbas/advertools/blob/master/advertools/stopwords.py
+STOP_WORDS = set(
+    """
+amma
+arasında
+artıq
+ay
+az
+bax
+belə
+beş
+bilər
+bir
+biraz
+biri
+birşey
+biz
+bizim
+bizlər
+bu
+buna
+bundan
+bunların
+bunu
+bunun
+buradan
+bütün
+bəli
+bəlkə
+bəy
+bəzi
+bəzən
+daha
+dedi
+deyil
+dir
+düz
+də
+dək
+dən
+dəqiqə
+edir
+edən
+elə
+et
+etdi
+etmə
+etmək
+faiz
+gilə
+görə
+ha
+haqqında
+harada
+heç
+hə
+həm
+həmin
+həmişə
+hər
+idi
+il
+ildə
+ilk
+ilə
+in
+indi
+istifadə
+isə
+ki
+kim
+kimi
+kimə
+lakin
+lap
+mirşey
+məhz
+mən
+mənə
+niyə
+nə
+nəhayət
+o
+obirisi
+of
+olan
+olar
+olaraq
+oldu
+olduğu
+olmadı
+olmaz
+olmuşdur
+olsun
+olur
+on
+ona
+ondan
+onlar
+onlardan
+onların
+onsuzda
+onu
+onun
+oradan
+qarşı
+qədər
+saat
+sadəcə
+saniyə
+siz
+sizin
+sizlər
+sonra
+səhv
+sən
+sənin
+sənə
+təəssüf
+var
+və
+xan
+xanım
+xeyr
+ya
+yalnız
+yaxşı
+yeddi
+yenə
+yox
+yoxdur
+yoxsa
+yəni
+zaman
+çox
+çünki
+öz
+özü
+üçün
+əgər
+əlbəttə
+ən
+əslində
+""".split()
+)

From 7cf5bd072fc1ca65be2a9eb3115aa838ba83b04d Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 29 Apr 2021 16:58:54 +0200
Subject: [PATCH 124/146] Refactor util.to_ternary_int (#7944)

* Refactor to avoid literal comparison with `is`
* Extend tests
---
 spacy/tests/test_misc.py | 16 ++++++++++++++++
 spacy/util.py            | 12 ++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 0d09999a9..b38a50f71 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -8,6 +8,7 @@ from spacy import prefer_gpu, require_gpu, require_cpu
 from spacy.ml._precomputable_affine import PrecomputableAffine
 from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
 from spacy.util import dot_to_object, SimpleFrozenList, import_file
+from spacy.util import to_ternary_int
 from thinc.api import Config, Optimizer, ConfigValidationError, get_current_ops
 from thinc.api import set_current_ops
 from spacy.training.batchers import minibatch_by_words
@@ -386,3 +387,18 @@ def make_dummy_component(
         nlp = English.from_config(config)
         nlp.add_pipe("dummy_component")
         nlp.initialize()
+
+
+def test_to_ternary_int():
+    assert to_ternary_int(True) == 1
+    assert to_ternary_int(None) == 0
+    assert to_ternary_int(False) == -1
+    assert to_ternary_int(1) == 1
+    assert to_ternary_int(1.0) == 1
+    assert to_ternary_int(0) == 0
+    assert to_ternary_int(0.0) == 0
+    assert to_ternary_int(-1) == -1
+    assert to_ternary_int(5) == -1
+    assert to_ternary_int(-10) == -1
+    assert to_ternary_int("string") == -1
+    assert to_ternary_int([0, "string"]) == -1
diff --git a/spacy/util.py b/spacy/util.py
index 512c6b742..84142d5d8 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1533,11 +1533,15 @@ def to_ternary_int(val) -> int:
     attributes such as SENT_START: True/1/1.0 is 1 (True), None/0/0.0 is 0
     (None), any other values are -1 (False).
     """
-    if isinstance(val, float):
-        val = int(val)
-    if val is True or val is 1:
+    if val is True:
         return 1
-    elif val is None or val is 0:
+    elif val is None:
+        return 0
+    elif val is False:
+        return -1
+    elif val == 1:
+        return 1
+    elif val == 0:
         return 0
     else:
         return -1

From cf032ec31e38f57940edfb93f041bcd373871554 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 29 Apr 2021 19:11:28 +0200
Subject: [PATCH 125/146] Update to catalogue>=2.0.4 (#7951)

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a8a15a01b..09d1cabda 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.8.1,<1.1.0
 srsly>=2.4.1,<3.0.0
-catalogue>=2.0.3,<2.1.0
+catalogue>=2.0.4,<2.1.0
 typer>=0.3.0,<0.4.0
 pathy>=0.3.5
 # Third party dependencies
diff --git a/setup.cfg b/setup.cfg
index 63d603a9c..5cda00fb2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -45,7 +45,7 @@ install_requires =
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
-    catalogue>=2.0.3,<2.1.0
+    catalogue>=2.0.4,<2.1.0
     typer>=0.3.0,<0.4.0
     pathy>=0.3.5
     # Third-party dependencies

From 2320791f6dc42f7724cedc86a420572c90aa7a5c Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 30 Apr 2021 12:21:31 +0200
Subject: [PATCH 126/146] Fix Transformer.initialize example (#7963)

---
 website/docs/api/transformer.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md
index 5aaa1d23e..6de2b0a87 100644
--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@@ -175,7 +175,7 @@ by [`Language.initialize`](/api/language#initialize).
 >
 > ```python
 > trf = nlp.add_pipe("transformer")
-> trf.initialize(lambda: [], nlp=nlp)
+> trf.initialize(lambda: iter([]), nlp=nlp)
 > ```
 
 | Name           | Description                                                                                                                           |

From 12d3d0feddc4f813d1cc63ab2465e31e9c8816cc Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Mon, 3 May 2021 11:48:12 +1000
Subject: [PATCH 127/146] Fix default checked state of conditional quickstart fields
 [ci skip]

---
 website/src/components/quickstart.js       | 3 ++-
 website/src/widgets/quickstart-training.js | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js
index 90a8e0983..a32db8975 100644
--- a/website/src/components/quickstart.js
+++ b/website/src/components/quickstart.js
@@ -105,12 +105,13 @@ const Quickstart = ({
                         multiple,
                         other,
                         help,
+                        hidden,
                     }) => {
                         // Optional function that's called with the value
                         const setterFunc = setters[id] || (() => {})
                         // Check if dropdown should be shown
                         const dropdownGetter = showDropdown[id] || (() => true)
-                        return (
+                        return hidden ? null : (
                             <div key={id} data-quickstart-group={id} className={classes.group}>
                                 <style data-quickstart-style={id} scoped>
                                     {styles[id] ||
diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js
index 849c80f3d..ad29c324f 100644
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@@ -115,7 +115,9 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
                     }))
                     .sort((a, b) => a.title.localeCompare(b.title))
                 if (!_components.includes('textcat')) {
-                    data = data.filter(({ id }) => id !== 'textcat')
+                    data = data.map(field =>
+                        field.id === 'textcat' ? { ...field, hidden: true } : field
+                    )
                 }
                 return (
                     <Quickstart

From e99ff6f2554f915c8c3543659f89d8b2c42496a0 Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Mon, 3 May 2021 08:44:09 -0400
Subject: [PATCH 128/146] Fix typo in Language docstrings (#7958)

---
 spacy/language.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index 6f6470533..95a902380 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -433,9 +433,9 @@ class Language:
         default_config (Dict[str, Any]): Default configuration, describing the
             default values of the factory arguments.
         assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
         default_score_weights (Dict[str, float]): The scores to report during
@@ -518,9 +518,9 @@ class Language:
 
         name (str): The name of the component factory.
         assigns (Iterable[str]): Doc/Token attributes assigned by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         requires (Iterable[str]): Doc/Token attributes required by this component,
-            e.g. "token.ent_id". Used for pipeline analyis.
+            e.g. "token.ent_id". Used for pipeline analysis.
         retokenizes (bool): Whether the component changes the tokenization.
             Used for pipeline analysis.
         func (Optional[Callable]): Factory function if not used as a decorator.

From 31528f62edf749b0014a10456fbe2a48b324b581 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 4 May 2021 11:00:10 +0200
Subject: [PATCH 129/146] Add / to nb infixes (#7991)

---
 spacy/lang/nb/punctuation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py
index 9b800029c..8f2933670 100644
--- a/spacy/lang/nb/punctuation.py
+++ b/spacy/lang/nb/punctuation.py
@@ -27,7 +27,7 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),

From debaab702111f237f2c2ecf985f9d1a5e0b8838e Mon Sep 17 00:00:00 2001
From: meghanabhange <meghanabhange13@gmail.com>
Date: Wed, 5 May 2021 20:42:13 +0530
Subject: [PATCH 130/146] Update details in universe denomme | Multilingual
 Name Detection (#7982)

* Add denomme

* spaCy contributor agreement

* Update install and thumb

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
---
 website/meta/universe.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index a0183c15d..0fcc1379f 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -6,7 +6,7 @@
             "slogan": "Multilingual Name Detection",
             "description": "A SpaCy extension for Spans to extract multilingual names out of documents trained on XLM-roberta backbone",
             "github": "meghanabhange/denomme",
-            "pip": "denomme",
+            "pip": "denomme https://denomme.s3.us-east-2.amazonaws.com/xx_denomme-0.3.1/dist/xx_denomme-0.3.1.tar.gz",
             "code_example": [
                 "from spacy.lang.xx import MultiLanguage",
                 "from denomme.name import person_name_component",
@@ -16,6 +16,8 @@
                 "print(doc._.person_name)",
                 "# ['Meghana S.R Bhange', 'Asha']"
             ],
+            "thumb":"https://i.ibb.co/jwGVWPZ/rainbow-bohemian-logo-removebg-preview.png",
+            "code_language": "python",
             "author": "Meghana Bhange",
             "author_links": {
             "github": "meghanabhange",

From a71194362f4a1f40e158ae216f9e3f852347f53e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 5 May 2021 18:44:14 +0200
Subject: [PATCH 131/146] Fix Docs.from_docs for all empty docs (#8009)

---
 spacy/tests/doc/test_doc_api.py |  3 +++
 spacy/tokens/doc.pyx            | 11 ++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index d7452a802..3aae063d3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -411,6 +411,9 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
 
+    # can merge empty docs
+    doc = Doc.from_docs([en_tokenizer("")] * 10)
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index aae0ff374..4858ad9dd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1158,11 +1158,12 @@ cdef class Doc:
             for i, array in enumerate(arrays[:-1]):
                 if len(array) > 0 and not docs[i][-1].is_space:
                     array[-1][spacy_index] = 1
-            token_offset = -1
-            for doc in docs[:-1]:
-                token_offset += len(doc)
-                if not (len(doc) > 0 and doc[-1].is_space):
-                    concat_spaces[token_offset] = True
+            if len(concat_spaces) > 0:
+                token_offset = -1
+                for doc in docs[:-1]:
+                    token_offset += len(doc)
+                    if not (len(doc) > 0 and doc[-1].is_space):
+                        concat_spaces[token_offset] = True
 
         concat_array = numpy.concatenate(arrays)
 

From 66bfabd839b85b4afc2bb0c0036d1e92c5b3e5c7 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Thu, 6 May 2021 15:27:36 +0900
Subject: [PATCH 132/146] Fix pretraining objectives fragment (#8005)

* Fix pretraining objectives fragment

The fragment here is reused from a heading higher up, so you couldn't
link to this section.

* Fix section link to new fragment
---
 website/docs/usage/embeddings-transformers.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md
index 4113e9394..9e3f140e4 100644
--- a/website/docs/usage/embeddings-transformers.md
+++ b/website/docs/usage/embeddings-transformers.md
@@ -678,7 +678,7 @@ The following defaults are used for the `[pretraining]` block and merged into
 your existing config when you run [`init config`](/api/cli#init-config) or
 [`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed,
 you can [configure](#pretraining-configure) the settings and hyperparameters or
-change the [objective](#pretraining-details).
+change the [objective](#pretraining-objectives).
 
 ```ini
 %%GITHUB_SPACY/spacy/default_config_pretraining.cfg
@@ -732,7 +732,7 @@ component = "textcat"
 layer = "tok2vec"
 ```
 
-#### Pretraining objectives {#pretraining-details}
+#### Pretraining objectives {#pretraining-objectives}
 
 > ```ini
 > ### Characters objective

From e9037d8fc0d001087c4d0463ee876af06349d028 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 May 2021 10:14:47 +0200
Subject: [PATCH 133/146] make EntityLinker robust for nO=None (#7930)

---
 spacy/ml/models/entity_linker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py
index 21e1c53b9..99bb85f32 100644
--- a/spacy/ml/models/entity_linker.py
+++ b/spacy/ml/models/entity_linker.py
@@ -11,7 +11,7 @@ from ...vocab import Vocab
 @registry.architectures("spacy.EntityLinker.v1")
 def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
-        token_width = tok2vec.get_dim("nO")
+        token_width = tok2vec.maybe_get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)
         model = (
             tok2vec

From 7d5db41ac34f719836ae0363ea596e7e6380fcd9 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 May 2021 10:34:35 +0200
Subject: [PATCH 134/146] Skip vector ngram backoff if minn is not set (#7925)

---
 spacy/vocab.pyx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index ee440898a..13dd675af 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -364,15 +364,15 @@ cdef class Vocab:
         word = self[orth].orth_
         if orth in self.vectors.key2row:
             return self.vectors[orth]
-        # Assign default ngram limits to minn and maxn which is the length of the word.
-        if minn is None:
-            minn = len(word)
-        if maxn is None:
-            maxn = len(word)
         xp = get_array_module(self.vectors.data)
         vectors = xp.zeros((self.vectors_length,), dtype="f")
+        if minn is None:
+            return vectors
         # Fasttext's ngram computation taken from
         # https://github.com/facebookresearch/fastText
+        # Assign default ngram limit to maxn which is the length of the word.
+        if maxn is None:
+            maxn = len(word)
         ngrams_size = 0;
         for i in range(len(word)):
             ngram = ""

From 0a22fed6341f1e26500b5d5420b46b96197b933e Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 May 2021 10:42:44 +0200
Subject: [PATCH 135/146] Fix span offsets for Matcher(as_spans) on spans
 (#7992)

Fix returned span offsets for `Matcher(as_spans=True)(span)`.
---
 spacy/matcher/matcher.pyx               | 8 +++++++-
 spacy/tests/matcher/test_matcher_api.py | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index dae12c3f6..f389b4abd 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -284,7 +284,13 @@ cdef class Matcher:
             if on_match is not None:
                 on_match(self, doc, i, final_matches)
         if as_spans:
-            return [Span(doc, start, end, label=key) for key, start, end in final_matches]
+            spans = []
+            for key, start, end in final_matches:
+                if isinstance(doclike, Span):
+                    start += doclike.start
+                    end += doclike.start
+                spans.append(Span(doc, start, end, label=key))
+            return spans
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]
             final_matches = []
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 094bf22a6..548da7dc6 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -513,6 +513,12 @@ def test_matcher_as_spans(matcher):
     assert matches[1].text == "Java"
     assert matches[1].label_ == "Java"
 
+    matches = matcher(doc[1:], as_spans=True)
+    assert len(matches) == 1
+    assert isinstance(matches[0], Span)
+    assert matches[0].text == "Java"
+    assert matches[0].label_ == "Java"
+
 
 def test_matcher_deprecated(matcher):
     doc = Doc(matcher.vocab, words=["hello", "world"])

From cc5aeaed29c067f60d11e07496704406a1577a35 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 May 2021 10:43:03 +0200
Subject: [PATCH 136/146] Add Chinese PTB tags to glossary (#7993)

---
 spacy/glossary.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/spacy/glossary.py b/spacy/glossary.py
index c4a6a5c45..0dc075ca7 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -58,7 +58,7 @@ GLOSSARY = {
     "FW": "foreign word",
     "HYPH": "punctuation mark, hyphen",
     "IN": "conjunction, subordinating or preposition",
-    "JJ": "adjective",
+    "JJ": "adjective (English), other noun-modifier (Chinese)",
     "JJR": "adjective, comparative",
     "JJS": "adjective, superlative",
     "LS": "list item marker",
@@ -88,7 +88,7 @@ GLOSSARY = {
     "WP": "wh-pronoun, personal",
     "WP$": "wh-pronoun, possessive",
     "WRB": "wh-adverb",
-    "SP": "space",
+    "SP": "space (English), sentence-final particle (Chinese)",
     "ADD": "email",
     "NFP": "superfluous punctuation",
     "GW": "additional word in multi-word expression",
@@ -152,6 +152,40 @@ GLOSSARY = {
     "VVIZU": 'infinitive with "zu", full',
     "VVPP": "perfect participle, full",
     "XY": "non-word containing non-letter",
+    # POS Tags (Chinese)
+    # OntoNotes / Chinese Penn Treebank
+    # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
+    "AD": "adverb",
+    "AS": "aspect marker",
+    "BA": "把 in ba-construction",
+    # "CD": "cardinal number",
+    "CS": "subordinating conjunction",
+    "DEC": "的 in a relative clause",
+    "DEG": "associative 的",
+    "DER": "得 in V-de const. and V-de-R",
+    "DEV": "地 before VP",
+    "ETC": "for words 等, 等等",
+    # "FW": "foreign words"
+    "IJ": "interjection",
+    # "JJ": "other noun-modifier",
+    "LB": "被 in long bei-const",
+    "LC": "localizer",
+    "M": "measure word",
+    "MSP": "other particle",
+    # "NN": "common noun",
+    "NR": "proper noun",
+    "NT": "temporal noun",
+    "OD": "ordinal number",
+    "ON": "onomatopoeia",
+    "P": "preposition excluding 把 and 被",
+    "PN": "pronoun",
+    "PU": "punctuation",
+    "SB": "被 in short bei-const",
+    # "SP": "sentence-final particle",
+    "VA": "predicative adjective",
+    "VC": "是 (copula)",
+    "VE": "有 as the main verb",
+    "VV": "other verb",
     # Noun chunks
     "NP": "noun phrase",
     "PP": "prepositional phrase",

From 02a6a5fea07bbd7703130639e097c954985ae532 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Thu, 6 May 2021 10:43:32 +0200
Subject: [PATCH 137/146] Fix 'debug model' for transformers + generalize
 (#7973)

* add overrides to docs

* fix debug model with transformer

* assume training data is set in config
---
 spacy/cli/debug_model.py | 68 +++++++++++++++-------------------------
 website/docs/api/cli.md  |  1 +
 2 files changed, 26 insertions(+), 43 deletions(-)

diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py
index 54c09c850..015e3a76b 100644
--- a/spacy/cli/debug_model.py
+++ b/spacy/cli/debug_model.py
@@ -1,5 +1,6 @@
 from typing import Dict, Any, Optional, Iterable
 from pathlib import Path
+import itertools
 
 from spacy.training import Example
 from spacy.util import resolve_dot_names
@@ -73,23 +74,24 @@ def debug_model_cli(
         msg.info(f"Fixing random seed: {seed}")
         fix_random_seed(seed)
     pipe = nlp.get_pipe(component)
-    if not hasattr(pipe, "model"):
-        msg.fail(
-            f"The component '{component}' does not specify an object that holds a Model.",
-            exits=1,
-        )
-    model = pipe.model
-    debug_model(config, T, nlp, model, print_settings=print_settings)
+
+    debug_model(config, T, nlp, pipe, print_settings=print_settings)
 
 
 def debug_model(
     config,
     resolved_train_config,
     nlp,
-    model: Model,
+    pipe,
     *,
     print_settings: Optional[Dict[str, Any]] = None,
 ):
+    if not hasattr(pipe, "model"):
+        msg.fail(
+            f"The component '{pipe}' does not specify an object that holds a Model.",
+            exits=1,
+        )
+    model = pipe.model
     if not isinstance(model, Model):
         msg.fail(
             f"Requires a Thinc Model to be analysed, but found {type(model)} instead.",
@@ -105,8 +107,6 @@ def debug_model(
         _print_model(model, print_settings)
 
     # STEP 1: Initializing the model and printing again
-    X = _get_docs()
-    # The output vector might differ from the official type of the output layer
     with data_validation(False):
         try:
             dot_names = [resolved_train_config["train_corpus"]]
@@ -114,15 +114,17 @@ def debug_model(
                 (train_corpus,) = resolve_dot_names(config, dot_names)
                 nlp.initialize(lambda: train_corpus(nlp))
             msg.info("Initialized the model with the training corpus.")
+            examples = list(itertools.islice(train_corpus(nlp), 5))
         except ValueError:
             try:
                 _set_output_dim(nO=7, model=model)
                 with show_validation_error():
-                    nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X])
+                    examples = [Example.from_dict(x, {}) for x in _get_docs()]
+                    nlp.initialize(lambda: examples)
                 msg.info("Initialized the model with dummy data.")
             except Exception:
                 msg.fail(
-                    "Could not initialize the model: you'll have to provide a valid train_corpus argument in the config file.",
+                    "Could not initialize the model: you'll have to provide a valid 'train_corpus' argument in the config file.",
                     exits=1,
                 )
 
@@ -133,26 +135,23 @@ def debug_model(
     # STEP 2: Updating the model and printing again
     optimizer = Adam(0.001)
     set_dropout_rate(model, 0.2)
-    # ugly hack to deal with Tok2Vec listeners
-    tok2vec = None
-    if model.has_ref("tok2vec") and model.get_ref("tok2vec").name == "tok2vec-listener":
-        tok2vec = nlp.get_pipe("tok2vec")
+    # ugly hack to deal with Tok2Vec/Transformer listeners
+    upstream_component = None
+    if model.has_ref("tok2vec") and "tok2vec-listener" in model.get_ref("tok2vec").name:
+        upstream_component = nlp.get_pipe("tok2vec")
+    if model.has_ref("tok2vec") and "transformer-listener" in model.get_ref("tok2vec").name:
+        upstream_component = nlp.get_pipe("transformer")
     goldY = None
     for e in range(3):
-        if tok2vec:
-            tok2vec.update([Example.from_dict(x, {}) for x in X])
-        Y, get_dX = model.begin_update(X)
-        if goldY is None:
-            goldY = _simulate_gold(Y)
-        dY = get_gradient(goldY, Y, model.ops)
-        get_dX(dY)
-        model.finish_update(optimizer)
+        if upstream_component:
+            upstream_component.update(examples)
+        pipe.update(examples)
     if print_settings.get("print_after_training"):
         msg.divider(f"STEP 2 - after training")
         _print_model(model, print_settings)
 
     # STEP 3: the final prediction
-    prediction = model.predict(X)
+    prediction = model.predict([ex.predicted for ex in examples])
     if print_settings.get("print_prediction"):
         msg.divider(f"STEP 3 - prediction")
         msg.info(str(prediction))
@@ -160,19 +159,6 @@ def debug_model(
     msg.good(f"Succesfully ended analysis - model looks good.")
 
 
-def get_gradient(goldY, Y, ops):
-    return ops.asarray(Y) - ops.asarray(goldY)
-
-
-def _simulate_gold(element, counter=1):
-    if isinstance(element, Iterable):
-        for i in range(len(element)):
-            element[i] = _simulate_gold(element[i], counter + i)
-        return element
-    else:
-        return 1 / counter
-
-
 def _sentences():
     return [
         "Apple is looking at buying U.K. startup for $1 billion",
@@ -209,11 +195,7 @@ def _print_model(model, print_settings):
 
             if dimensions:
                 for name in node.dim_names:
-                    if node.has_dim(name):
-                        msg.info(f" - dim {name}: {node.get_dim(name)}")
-                    else:
-                        msg.info(f" - dim {name}: {node.has_dim(name)}")
-
+                    msg.info(f" - dim {name}: {node.maybe_get_dim(name)}")
             if parameters:
                 for name in node.param_names:
                     if node.has_param(name):
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 196e47543..685f998ff 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -768,6 +768,7 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
 | `--print-step3`, `-P3`  | Print final predictions. ~~bool (flag)~~                                                                                                                                                                           |
 | `--gpu-id`, `-g`        | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~                                                                                                                                                         |
 | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                         |
+| overrides               | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **PRINTS**              | Debugging information.                                                                                                                                                                                             |
 
 ## train {#train tag="command"}

From 6788d90f61a1071c150ee73bc66efaf41a4e8da0 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Thu, 6 May 2021 10:49:55 +0200
Subject: [PATCH 138/146] Preserve existing ENT_KB_ID annotation in NER (#7988)

* Preserve existing ENT_KB_ID annotation in NER

Preserve `ent_kb_id` annotation on existing entity spans, which is not
preserved by the transition system.

* Simplify kb_id assignment

* Simplify further
---
 spacy/pipeline/_parser_internals/ner.pyx |  2 +-
 spacy/tests/parser/test_ner.py           | 22 +++++++++++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
index dd747c08e..4b0d07725 100644
--- a/spacy/pipeline/_parser_internals/ner.pyx
+++ b/spacy/pipeline/_parser_internals/ner.pyx
@@ -247,7 +247,7 @@ cdef class BiluoPushDown(TransitionSystem):
         for i in range(state.c._ents.size()):
             ent = state.c._ents.at(i)
             if ent.start != -1 and ent.end != -1:
-                ents.append(Span(doc, ent.start, ent.end, label=ent.label))
+                ents.append(Span(doc, ent.start, ent.end, label=ent.label, kb_id=doc.c[ent.start].ent_kb_id))
         doc.set_ents(ents, default="unmodified")
         # Set non-blocked tokens to O
         for i in range(doc.length):
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index dffdff1ec..bebadf7e9 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -8,7 +8,7 @@ from spacy.language import Language
 from spacy.lookups import Lookups
 from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
 import logging
 
@@ -358,6 +358,26 @@ def test_overfitting_IO(use_upper):
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
 
+    # test that kb_id is preserved
+    test_text = "I like London and London."
+    doc = nlp.make_doc(test_text)
+    doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)]
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+    assert ents[0].kb_id == 1234
+    doc = nlp.get_pipe("ner")(doc)
+    ents = doc.ents
+    assert len(ents) == 2
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+    assert ents[0].kb_id == 1234
+    # ent added by ner has kb_id == 0
+    assert ents[1].text == "London"
+    assert ents[1].label_ == "LOC"
+    assert ents[1].kb_id == 0
+
 
 def test_beam_ner_scores():
     # Test that we can get confidence values out of the beam_ner pipe

From 5cf76ab60821ec3fb7165070a190976ef7e9eac5 Mon Sep 17 00:00:00 2001
From: Jeno Pizarro <jenojp@users.noreply.github.com>
Date: Fri, 7 May 2021 03:33:21 -0400
Subject: [PATCH 139/146] Update negspacy example code for spaCy 3.0 (#8022)

---
 website/meta/universe.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 0fcc1379f..721de1eb5 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2375,8 +2375,7 @@
                 "from negspacy.negation import Negex",
                 "",
                 "nlp = spacy.load(\"en_core_web_sm\")",
-                "negex = Negex(nlp, ent_types=[\"PERSON','ORG\"])",
-                "nlp.add_pipe(negex, last=True)",
+                "nlp.add_pipe(\"negex\", config={\"ent_types\":[\"PERSON\",\"ORG\"]})",
                 "",
                 "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")",
                 "for e in doc.ents:",

From 71c2a3ab4747e466e77a0a590c9fed4d3b791f29 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 7 May 2021 09:55:20 +0200
Subject: [PATCH 140/146] Fix new version for match_alignments (#8021)

---
 website/docs/api/matcher.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index c15ee7a47..c0e169799 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -126,7 +126,7 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
 | _keyword-only_                                 |                                                                                                                                                                                                                                                                                                          |
 | `as_spans` <Tag variant="new">3</Tag>          | Instead of tuples, return a list of [`Span`](/api/span) objects of the matches, with the `match_id` assigned as the span label. Defaults to `False`. ~~bool~~                                                                                                                                            |
 | `allow_missing` <Tag variant="new">3</Tag>     | Whether to skip checks for missing annotation for attributes included in patterns. Defaults to `False`. ~~bool~~                                                                                                                                                                                         |
-| `with_alignments` <Tag variant="new">3.1</Tag> | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~                             |
+| `with_alignments` <Tag variant="new">3.0.6</Tag> | Return match alignment information as part of the match tuple as `List[int]` with the same length as the matched span. Each entry denotes the corresponding index of the token pattern. If `as_spans` is set to `True`, this setting is ignored. Defaults to `False`. ~~bool~~                             |
 | **RETURNS**                                    | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. If `as_spans` is set to `True`, a list of `Span` objects is returned instead. ~~Union[List[Tuple[int, int, int]], List[Span]]~~ |
 
 ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}

From bdeaf3a18b1c62b41425ac8d2be0dd99e418e805 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Fri, 7 May 2021 17:26:42 +0900
Subject: [PATCH 141/146] Fix/fix en ordinals (#8028)

* Fix #8019

"th" is not the only ordinal ending.

* Add some more ordinal tests
---
 spacy/lang/en/lex_attrs.py       | 2 +-
 spacy/tests/lang/en/test_text.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py
index fcc7c6bf2..b630a317d 100644
--- a/spacy/lang/en/lex_attrs.py
+++ b/spacy/lang/en/lex_attrs.py
@@ -35,7 +35,7 @@ def like_num(text: str) -> bool:
     # Check ordinal number
     if text_lower in _ordinal_words:
         return True
-    if text_lower.endswith("th"):
+    if text_lower.endswith(("st", "nd", "rd", "th")):
         if text_lower[:-2].isdigit():
             return True
     return False
diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py
index 733e814f7..358f4c0f9 100644
--- a/spacy/tests/lang/en/test_text.py
+++ b/spacy/tests/lang/en/test_text.py
@@ -56,7 +56,9 @@ def test_lex_attrs_like_number(en_tokenizer, text, match):
     assert tokens[0].like_num == match
 
 
-@pytest.mark.parametrize("word", ["third", "Millionth", "100th", "Hundredth"])
+@pytest.mark.parametrize(
+    "word", ["third", "Millionth", "100th", "Hundredth", "23rd", "52nd"]
+)
 def test_en_lex_attrs_like_number_for_ordinal(word):
     assert like_num(word)
 

From 3883d494460c6f52793cad4378f947ae7a127372 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 11 May 2021 11:27:08 +1000
Subject: [PATCH 142/146] Fix default transformer in quickstart generator
 (resolves #8018) [ci skip]

---
 website/src/widgets/quickstart-training.js | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/src/widgets/quickstart-training.js b/website/src/widgets/quickstart-training.js
index ad29c324f..8481d2048 100644
--- a/website/src/widgets/quickstart-training.js
+++ b/website/src/widgets/quickstart-training.js
@@ -86,13 +86,14 @@ export default function QuickstartTraining({ id, title, download = 'base_config.
             updateComponents(_components, isExclusive)
         },
     }
-    const reco = GENERATOR_DATA[lang] || GENERATOR_DATA.__default__
+    const defaultData = GENERATOR_DATA.__default__
+    const reco = GENERATOR_DATA[lang] || defaultData
     const content = generator({
         lang,
         components,
         optimize,
         hardware,
-        transformer_data: reco.transformer,
+        transformer_data: reco.transformer || defaultData.transformer,
         word_vectors: reco.word_vectors,
         has_letters: reco.has_letters,
     })

From d5bbd1f94fd6e89416d83fbd3f28f4d7c8f2a0fb Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Tue, 11 May 2021 17:10:16 +0200
Subject: [PATCH 143/146] Handle partial entities in Span.as_doc (#8055)

* Handle partial entities in Span.as_doc

In `Span.as_doc` replace partial entities at the beginning or end of the
span with missing entity annotation.

Fixes a bug where invalid entity annotation (no initial `B`) was
returned for an initial partial entity.

* Check for empty span in ents conversion

Note: `Span.as_doc()` will still fail on an empty span due to failures
in `Span.vector`.
---
 spacy/tests/doc/test_span.py | 15 ++++++++++++++-
 spacy/tokens/span.pyx        | 18 ++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 6a5689971..45d9c9aa0 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -14,9 +14,11 @@ def doc(en_tokenizer):
     heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6, 12, 12, 12, 12]
     deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det",
             "attr", "punct", "ROOT", "det", "npadvmod", "punct"]
+    ents = ["O", "O", "B-ENT", "I-ENT", "I-ENT", "I-ENT", "I-ENT", "O", "O",
+            "O", "O", "O", "O", "O"]
     # fmt: on
     tokens = en_tokenizer(text)
-    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+    return Doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, ents=ents)
 
 
 @pytest.fixture
@@ -220,6 +222,17 @@ def test_span_as_doc(doc):
     assert span_doc is not doc
     assert span_doc[0].idx == 0
 
+    # partial initial entity is removed
+    assert len(span_doc.ents) == 0
+
+    # full entity is preserved
+    span_doc = doc[2:10].as_doc()
+    assert len(span_doc.ents) == 1
+
+    # partial final entity is removed
+    span_doc = doc[0:5].as_doc()
+    assert len(span_doc.ents) == 0
+
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_span_as_doc_user_data(doc):
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 614d8fda5..05bbb8cc5 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -228,7 +228,25 @@ cdef class Span:
         array = self.doc.to_array(array_head)
         array = array[self.start : self.end]
         self._fix_dep_copy(array_head, array)
+        # Fix initial IOB so the entities are valid for doc.ents below.
+        if len(array) > 0 and ENT_IOB in array_head:
+            ent_iob_col = array_head.index(ENT_IOB)
+            if array[0][ent_iob_col] == 1:
+                array[0][ent_iob_col] = 3
         doc.from_array(array_head, array)
+        # Set partial entities at the beginning or end of the span to have
+        # missing entity annotation. Note: the initial partial entity could be
+        # detected from the IOB annotation but the final partial entity can't,
+        # so detect and remove both in the same way by checking self.ents.
+        span_ents = {(ent.start, ent.end) for ent in self.ents}
+        doc_ents = doc.ents
+        if len(doc_ents) > 0:
+            # Remove initial partial ent
+            if (doc_ents[0].start + self.start, doc_ents[0].end + self.start) not in span_ents:
+                doc.set_ents([], missing=[doc_ents[0]], default="unmodified")
+            # Remove final partial ent
+            if (doc_ents[-1].start + self.start, doc_ents[-1].end + self.start) not in span_ents:
+                doc.set_ents([], missing=[doc_ents[-1]], default="unmodified")
         doc.noun_chunks_iterator = self.doc.noun_chunks_iterator
         doc.user_hooks = self.doc.user_hooks
         doc.user_span_hooks = self.doc.user_span_hooks

From 7bba9cdc143ddefb0a6047c2573f022396024936 Mon Sep 17 00:00:00 2001
From: "Frederic R. Hopp" <fhopp@umail.ucsb.edu>
Date: Tue, 11 May 2021 19:18:19 -0700
Subject: [PATCH 144/146] Update universe.json

---
 website/meta/universe.json | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 721de1eb5..5b14b54be 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -25,6 +25,32 @@
             },
             "category": ["standalone"],
             "tags": ["person-name-detection"]
+        },
+        {
+            "id": "eMFDscore",
+            "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",
+            "slogan": "Extended Moral Foundation Dictionary Scoring for Python",
+            "description": "Fast, flexible extraction of moral information from textual input data.",
+            "github": "https://github.com/medianeuroscience/emfdscore",
+            "code_example": [
+                "from emfdscore.scoring import score_docs",
+                "import pandas as pd",
+                "template_input = pd.read_csv('emfdscore/template_input.csv', header=None)",
+                "DICT_TYPE = 'emfd'",
+                "PROB_MAP = 'single'",
+                "SCORE_METHOD = 'bow'",
+                "OUT_METRICS = 'vice-virtue'",
+                "OUT_CSV_PATH = 'single-vv.csv'",
+                "df = score_docs(template_input,DICT_TYPE,PROB_MAP,SCORE_METHOD,OUT_METRICS,num_docs)"
+            ],
+            "code_language": "python",
+            "author": "Media Neuroscience Lab",
+            "author_links": {
+            "github": "medianeuroscience",
+            "twitter": "medianeuro"
+            },
+            "category": ["research", "teaching"],
+            "tags": ["morality", "dictionary", "sentiment"]
         },
 	      {
             "id": "skweak",

From a9ca221e03544a7bda5ef5032e7c2fd63a8b52b5 Mon Sep 17 00:00:00 2001
From: "Frederic R. Hopp" <fhopp@umail.ucsb.edu>
Date: Wed, 12 May 2021 09:20:17 -0700
Subject: [PATCH 145/146] Update universe.json

Added more detailed description to eMFDscore project
---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 5b14b54be..533193487 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -30,7 +30,7 @@
             "id": "eMFDscore",
             "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",
             "slogan": "Extended Moral Foundation Dictionary Scoring for Python",
-            "description": "Fast, flexible extraction of moral information from textual input data.",
+            "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is build on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.",
             "github": "https://github.com/medianeuroscience/emfdscore",
             "code_example": [
                 "from emfdscore.scoring import score_docs",

From c5962b9fba12fefa68870416bd637af7e6b74499 Mon Sep 17 00:00:00 2001
From: "Frederic R. Hopp" <fhopp@umail.ucsb.edu>
Date: Thu, 13 May 2021 07:40:05 -0700
Subject: [PATCH 146/146] Update universe.json

fixed typo
---
 website/meta/universe.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/meta/universe.json b/website/meta/universe.json
index 533193487..87328074a 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -30,7 +30,7 @@
             "id": "eMFDscore",
             "title": "eMFDscore : Extended Moral Foundation Dictionary Scoring for Python",
             "slogan": "Extended Moral Foundation Dictionary Scoring for Python",
-            "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is build on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.",
+            "description": "eMFDscore is a library for the fast and flexible extraction of various moral information metrics from textual input data. eMFDscore is built on spaCy for faster execution and performs minimal preprocessing consisting of tokenization, syntactic dependency parsing, lower-casing, and stopword/punctuation/whitespace removal. eMFDscore lets users score documents with multiple Moral Foundations Dictionaries, provides various metrics for analyzing moral information, and extracts moral patient, agent, and attribute words related to entities.",
             "github": "https://github.com/medianeuroscience/emfdscore",
             "code_example": [
                 "from emfdscore.scoring import score_docs",