diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 16e257ce2..cfa126cc4 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -297,9 +297,7 @@ def ensure_pathy(path):
     return Pathy(path)
 
 
-def git_sparse_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
-):
+def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
     if dest.exists():
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
@@ -323,21 +321,30 @@ def git_sparse_checkout(
         # This is the "clone, but don't download anything" part.
         cmd = (
             f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            "--filter=blob:none"  # <-- The key bit
+            f"--filter=blob:none "  # <-- The key bit
+            f"-b {branch}"
         )
-        if branch is not None:
-            cmd = f"{cmd} -b {branch}"
         run_command(cmd, capture=True)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
         ret = run_command(cmd, capture=True)
-        missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+        repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
-        run_command(
-            f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
-        )
+        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+        cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
+        run_command(cmd, capture=True)
         # And finally, we can checkout our subpath
-        run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
+        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
+        run_command(cmd)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def _from_http_to_git(repo):
+    if repo.startswith("http://"):
+        repo = repo.replace(r"http://", r"https://")
+    if repo.startswith(r"https://"):
+        repo = repo.replace("https://", "git@").replace("/", ":", 1)
+        repo = f"{repo}.git"
+    return repo
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 7f9a46a46..751c389bc 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -43,7 +43,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
         git_sparse_checkout(repo, name, dest)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
-        msg.fail(err)
+        msg.fail(err, exits=1)
     msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
     if not (project_dir / PROJECT_FILE).exists():
         msg.warn(f"No {PROJECT_FILE} found in directory")
@@ -78,6 +78,7 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
     if not dest.parent.exists():
         # We're not creating parents, parent dir should exist
         msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
             exits=1,
         )
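Note on the `_from_http_to_git` helper added above: the rewritten checkout path now converts an HTTP(S) clone URL into the SSH-style form before handing it to `git fetch-pack`. A standalone copy of the helper, with an illustrative URL that is not part of the diff:

```python
# Copy of the helper added in spacy/cli/_util.py above; the example
# URL below is illustrative only.
def _from_http_to_git(repo):
    if repo.startswith("http://"):
        repo = repo.replace(r"http://", r"https://")
    if repo.startswith(r"https://"):
        repo = repo.replace("https://", "git@").replace("/", ":", 1)
        repo = f"{repo}.git"
    return repo

assert (
    _from_http_to_git("https://github.com/explosion/projects")
    == "git@github.com:explosion/projects.git"
)
```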
" + f"Create the necessary folder(s) first before continuing.", exits=1, ) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index 597f01b65..f827cd677 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") +_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -26,7 +27,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9])(?:{u})".format(u=_units), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 0d0dd8c05..d31430124 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -42,6 +42,7 @@ cdef cppclass StateC: RingBufferC _hist int length int offset + int n_pushes int _s_i int _b_i int _e_i @@ -49,6 +50,7 @@ cdef cppclass StateC: __init__(const TokenC* sent, int length) nogil: cdef int PADDING = 5 + this.n_pushes = 0 this._buffer = calloc(length + (PADDING * 2), sizeof(int)) this._stack = calloc(length + (PADDING * 2), sizeof(int)) this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) @@ -335,6 +337,7 @@ cdef cppclass StateC: this.set_break(this.B_(0).l_edge) if this._b_i > this._break: this._break = -1 + this.n_pushes += 1 void pop() nogil: if this._s_i >= 1: @@ -351,6 +354,7 @@ cdef cppclass StateC: this._buffer[this._b_i] = this.S(0) this._s_i -= 1 this.shifted[this.B(0)] = True + this.n_pushes -= 1 void add_arc(int head, int child, attr_t label) nogil: if this.has_head(child): @@ -431,6 +435,7 @@ cdef cppclass StateC: this._break = src._break this.offset = src.offset this._empty_token = src._empty_token + this.n_pushes = src.n_pushes void fast_forward() nogil: # space token attachement policy: diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index 880cf6cc5..d59ade467 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -36,6 +36,10 @@ cdef class StateClass: hist[i] = self.c.get_hist(i+1) return hist + @property + def n_pushes(self): + return self.c.n_pushes + def is_final(self): return self.c.is_final() diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c94cb6b58..f831caefe 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -289,7 +289,14 @@ class Tagger(Pipe): err = Errors.E1006.format(name="Tagger") raise ValueError(err) self.set_output(len(self.labels)) - self.model.initialize(X=doc_sample) + if doc_sample: + label_sample = [ + self.model.ops.alloc2f(len(doc), len(self.labels)) + for doc in doc_sample + ] + self.model.initialize(X=doc_sample, Y=label_sample) + else: + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 2eadfa6aa..2169b4c17 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -279,14 +279,14 @@ cdef class Parser(Pipe): # Chop sequences into lengths of this many transitions, to make the # batch uniform length. # We used to randomize this, but it's not clear that actually helps? 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2eadfa6aa..2169b4c17 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -279,14 +279,14 @@ cdef class Parser(Pipe):
             # Chop sequences into lengths of this many transitions, to make the
             # batch uniform length.
             # We used to randomize this, but it's not clear that actually helps?
-            cut_size = self.cfg["update_with_oracle_cut_size"]
-            states, golds, max_steps = self._init_gold_batch(
+            max_pushes = self.cfg["update_with_oracle_cut_size"]
+            states, golds, _ = self._init_gold_batch(
                 examples,
-                max_length=cut_size
+                max_length=max_pushes
             )
         else:
             states, golds, _ = self.moves.init_gold_batch(examples)
-            max_steps = max([len(eg.x) for eg in examples])
+            max_pushes = max([len(eg.x) for eg in examples])
         if not states:
             return losses
         all_states = list(states)
@@ -302,7 +302,8 @@ cdef class Parser(Pipe):
                 backprop(d_scores)
                 # Follow the predicted action
                 self.transition_states(states, scores)
-                states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+                states_golds = [(s, g) for (s, g) in zip(states, golds)
+                                if s.n_pushes < max_pushes and not s.is_final()]
 
             backprop_tok2vec(golds)
             if sgd not in (None, False):
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 66c27b233..12ead90cb 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -84,9 +84,8 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
-    textcat = nlp.add_pipe("textcat")
     # Set exclusive labels
-    textcat.model.attrs["multi_label"] = False
+    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -103,9 +102,8 @@ def test_overfitting_IO():
     test_text = "I am happy."
     doc = nlp(test_text)
     cats = doc.cats
-    # note that by default, exclusive_classes = false so we need a bigger error margin
-    assert cats["POSITIVE"] > 0.8
-    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
+    assert cats["POSITIVE"] > 0.9
+    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -113,8 +111,8 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         cats2 = doc2.cats
-        assert cats2["POSITIVE"] > 0.8
-        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
+        assert cats2["POSITIVE"] > 0.9
+        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Test scoring
     scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
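On the tightened assertions in the test above: with `exclusive_classes: True` the textcat scores should come out of a softmax-style normalization, so the two class probabilities are expected to sum to 1 almost exactly — hence the tolerance dropping from `0.1` to `0.001` and the margin from `0.8` to `0.9`. A hedged illustration with made-up scores:

```python
import pytest

# Made-up, normalized scores for an exclusive two-class textcat.
cats = {"POSITIVE": 0.95, "NEGATIVE": 0.05}
assert cats["POSITIVE"] > 0.9
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
```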
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index aca9a76e5..3ef28acaf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -3,8 +3,9 @@ title: Layers and Model Architectures
 teaser: Power spaCy components with custom neural networks
 menu:
   - ['Type Signatures', 'type-sigs']
-  - ['Defining Sublayers', 'sublayers']
+  - ['Swapping Architectures', 'swap-architectures']
   - ['PyTorch & TensorFlow', 'frameworks']
+  - ['Thinc Models', 'thinc']
   - ['Trainable Components', 'components']
 next: /usage/projects
 ---
@@ -22,8 +23,6 @@ its model architecture. The architecture is like a recipe for the network, and
 you can't change the recipe once the dish has already been prepared. You have
 to make a new one.
 
-![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
-
 ## Type signatures {#type-sigs}
 
 
@@ -92,9 +91,13 @@
 
 code.
 
-## Defining sublayers {#sublayers}
+## Swapping model architectures {#swap-architectures}
 
-Model architecture functions often accept **sublayers as arguments**, so that
+
+
+### Defining sublayers {#sublayers}
+
+Model architecture functions often accept **sublayers as arguments**, so that
 you can try **substituting a different layer** into the network. Depending on
 how the architecture function is structured, you might be able to define your
 network structure entirely through the [config system](/usage/training#config),
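To make the new "Defining sublayers" section concrete — a minimal sketch of registering a drop-in `Model[List[Doc], List[Floats2d]]` architecture function, as the docs describe. The registry name and the zero-output body are hypothetical, not part of this diff:

```python
from typing import List
from thinc.api import Model
from thinc.types import Floats2d
from spacy import registry
from spacy.tokens import Doc

@registry.architectures("my_org.ZeroTok2Vec.v1")  # hypothetical name
def zero_tok2vec(width: int) -> Model[List[Doc], List[Floats2d]]:
    # Degenerate tok2vec sublayer: one all-zeros array per doc.
    def forward(model: Model, docs: List[Doc], is_train: bool):
        outputs = [model.ops.alloc2f(len(doc), width) for doc in docs]
        return outputs, lambda d_outputs: []
    return Model("zero_tok2vec", forward)
```

Any function registered this way can then be referenced from the config in place of the default tok2vec sublayer.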
@@ -114,62 +117,37 @@
 approaches. And if you want to define your own solution, all you need to do is
 register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
 you'll be able to try it out in any of the spaCy components.
-
-
-### Registering new architectures
-
-- Recap concept, link to config docs.
+
 
 ## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
 
-
+Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks)
+written in other machine learning frameworks like PyTorch, TensorFlow and MXNet
+using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as
+**wrapping whole models**, Thinc lets you call into an external framework for
+just **part of your model**: you can have a model where you use PyTorch just for
+the transformer layers, using "native" Thinc layers to do fiddly input and
+output transformations and add on task-specific "heads", as efficiency is less
+of a consideration for those parts of the network.
+
-Thinc allows you to wrap models written in other machine learning frameworks
-like PyTorch, TensorFlow and MXNet using a unified
-[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole
-models**, Thinc lets you call into an external framework for just **part of your
-model**: you can have a model where you use PyTorch just for the transformer
-layers, using "native" Thinc layers to do fiddly input and output
-transformations and add on task-specific "heads", as efficiency is less of a
-consideration for those parts of the network.
-
-Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to
-hold references to external objects. This allows each wrapper space to define a
-custom type, with whatever attributes and methods are helpful, to assist in
-managing the communication between Thinc and the external library. The
-[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in
-a separate list, and communicates with the shims about updates, serialization,
-changes of device, etc.
+## Implementing models in Thinc {#thinc}
 
-The wrapper will receive each batch of inputs, convert them into a suitable form
-for the underlying model instance, and pass them over to the shim, which will
-**manage the actual communication** with the model. The output is then passed
-back into the wrapper, and converted for use in the rest of the network. The
-equivalent procedure happens during backpropagation. Array conversion is handled
-via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so
-that data can be passed between the frameworks **without copying the data back**
-to the host device unnecessarily.
-
-| Framework      | Wrapper layer                                                             | Shim                                                      | DLPack          |
-| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- |
-| **PyTorch**    | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper)       | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims)    | ✅              |
-| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ 1            |
-| **MXNet**      | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper)           | [`MXNetShim`](https://thinc.ai/docs/api-model#shims)      | ✅              |
-
-1. DLPack support in TensorFlow is now
-   [available](<(https://github.com/tensorflow/tensorflow/issues/24453)>) but
-   still experimental.
-
-
+
 
 ## Models for trainable components {#components}
 
+
+![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
 
 ```python
 def update(self, examples):
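To make the rewritten frameworks paragraph concrete — a minimal sketch of wrapping part of a network with Thinc's [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper) and composing it with a native layer; the layer sizes are arbitrary, and this example is not part of the diff:

```python
# A torch module becomes a regular Thinc Model and composes with native layers.
import torch.nn
from thinc.api import PyTorchWrapper, Softmax, chain

torch_block = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU())
model = chain(PyTorchWrapper(torch_block), Softmax(nO=5, nI=32))
```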