Merge remote-tracking branch 'upstream/develop' into feature/docs-layers

# Conflicts:
#	website/docs/usage/layers-architectures.md
svlandeg 2020-09-02 13:17:11 +02:00
commit 422df9c2e2
9 changed files with 79 additions and 77 deletions


@@ -297,9 +297,7 @@ def ensure_pathy(path):
     return Pathy(path)
 
 
-def git_sparse_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
-):
+def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
     if dest.exists():
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
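The new signature makes `branch` a plain string with a default, so call sites no longer need to handle `None`. Hypothetical calls (the repo, subpath and dest values are illustrative):

```python
from pathlib import Path

# branch defaults to "master"; pass branch=... to check out something else.
git_sparse_checkout("https://github.com/explosion/projects", "some_project", Path("out"))
git_sparse_checkout("https://github.com/explosion/projects", "some_project", Path("out2"), branch="develop")
```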
@@ -323,21 +321,30 @@ def git_sparse_checkout(
     # This is the "clone, but don't download anything" part.
     cmd = (
         f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-        "--filter=blob:none"  # <-- The key bit
+        f"--filter=blob:none "  # <-- The key bit
+        f"-b {branch}"
     )
-    if branch is not None:
-        cmd = f"{cmd} -b {branch}"
     run_command(cmd, capture=True)
     # Now we need to find the missing filenames for the subpath we want.
     # Looking for this 'rev-list' command in the git --help? Hah.
     cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
     ret = run_command(cmd, capture=True)
-    missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+    repo = _from_http_to_git(repo)
     # Now pass those missings into another bit of git internals
-    run_command(
-        f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
-    )
+    missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+    cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
+    run_command(cmd, capture=True)
     # And finally, we can checkout our subpath
-    run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
+    cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
+    run_command(cmd)
     # We need Path(name) to make sure we also support subdirectories
     shutil.move(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def _from_http_to_git(repo):
+    if repo.startswith("http://"):
+        repo = repo.replace(r"http://", r"https://")
+    if repo.startswith(r"https://"):
+        repo = repo.replace("https://", "git@").replace("/", ":", 1)
+        repo = f"{repo}.git"
+    return repo
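The new `_from_http_to_git` helper rewrites HTTP(S) clone URLs into SSH-style addresses before they are passed to `fetch-pack`. A quick sketch of what it produces, with an example URL not taken from the diff:

```python
print(_from_http_to_git("https://github.com/explosion/projects"))
# git@github.com:explosion/projects.git
print(_from_http_to_git("http://github.com/explosion/projects"))
# git@github.com:explosion/projects.git ("http://" is upgraded to "https://" first)
```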


@@ -43,7 +43,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
         git_sparse_checkout(repo, name, dest)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
-        msg.fail(err)
+        msg.fail(err, exits=1)
     msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
     if not (project_dir / PROJECT_FILE).exists():
         msg.warn(f"No {PROJECT_FILE} found in directory")
@@ -78,6 +78,7 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
     if not dest.parent.exists():
         # We're not creating parents, parent dir should exist
         msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
             exits=1,
         )
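Both hunks route errors through wasabi's printer with `exits` set, which prints the message and then terminates the process, so the clone failure can no longer fall through to `msg.good`. A minimal sketch (the project name is illustrative):

```python
from wasabi import msg

# exits=1 prints the formatted error and then calls sys.exit(1);
# nothing after this line runs.
msg.fail("Could not clone 'some_project'", exits=1)
```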


@@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
     LIST_PUNCT
@@ -26,7 +27,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
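The suffix rule now interpolates `_units`, i.e. `UNITS` with `%` stripped, so a trailing percent sign is no longer split off as a unit. A toy illustration using a stand-in alternation rather than spaCy's real `UNITS` string:

```python
import re

_units = "km|cm|kg"  # toy stand-in for UNITS.replace("%", "")
suffix_re = re.compile(r"(?<=[0-9])(?:{u})$".format(u=_units))
print(bool(suffix_re.search("100km")))  # True: "km" is still split off
print(bool(suffix_re.search("100%")))   # False: "%" no longer matches the unit rule
```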


@@ -42,6 +42,7 @@ cdef cppclass StateC:
     RingBufferC _hist
     int length
     int offset
+    int n_pushes
     int _s_i
     int _b_i
     int _e_i
@@ -49,6 +50,7 @@ cdef cppclass StateC:
     __init__(const TokenC* sent, int length) nogil:
         cdef int PADDING = 5
+        this.n_pushes = 0
         this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
         this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
         this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
@@ -335,6 +337,7 @@ cdef cppclass StateC:
             this.set_break(this.B_(0).l_edge)
         if this._b_i > this._break:
             this._break = -1
+        this.n_pushes += 1
 
     void pop() nogil:
         if this._s_i >= 1:
@@ -351,6 +354,7 @@ cdef cppclass StateC:
         this._buffer[this._b_i] = this.S(0)
         this._s_i -= 1
         this.shifted[this.B(0)] = True
+        this.n_pushes -= 1
 
     void add_arc(int head, int child, attr_t label) nogil:
         if this.has_head(child):
@@ -431,6 +435,7 @@ cdef cppclass StateC:
         this._break = src._break
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.n_pushes = src.n_pushes
 
     void fast_forward() nogil:
         # space token attachement policy:
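Taken together, these hunks add simple bookkeeping to `StateC`: `n_pushes` starts at zero, is incremented on push, decremented on unshift, and copied on clone. A toy Python sketch of the invariant (illustration only, not the Cython class):

```python
class ToyState:
    def __init__(self):
        self.n_pushes = 0

    def push(self):
        # ...move the first buffer token onto the stack...
        self.n_pushes += 1

    def unshift(self):
        # ...move the top of the stack back onto the buffer...
        self.n_pushes -= 1

state = ToyState()
state.push()
state.push()
state.unshift()
assert state.n_pushes == 1  # net number of pushes so far
```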

View File

@@ -36,6 +36,10 @@ cdef class StateClass:
             hist[i] = self.c.get_hist(i+1)
         return hist
 
+    @property
+    def n_pushes(self):
+        return self.c.n_pushes
+
     def is_final(self):
         return self.c.is_final()


@@ -289,7 +289,14 @@ class Tagger(Pipe):
             err = Errors.E1006.format(name="Tagger")
             raise ValueError(err)
         self.set_output(len(self.labels))
-        self.model.initialize(X=doc_sample)
+        if doc_sample:
+            label_sample = [
+                self.model.ops.alloc2f(len(doc), len(self.labels))
+                for doc in doc_sample
+            ]
+            self.model.initialize(X=doc_sample, Y=label_sample)
+        else:
+            self.model.initialize()
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
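`ops.alloc2f(d0, d1)` allocates a zeroed float32 array, so the `Y` sample mirrors the tagger's output: one `(n_tokens, n_labels)` array per doc, from which the model can infer its output width. A sketch of the shapes with plain numpy and made-up sizes:

```python
import numpy

doc_lengths = [3, 5]  # hypothetical token counts for two sample docs
n_labels = 4          # hypothetical tagset size
label_sample = [numpy.zeros((n, n_labels), dtype="f") for n in doc_lengths]
print([y.shape for y in label_sample])  # [(3, 4), (5, 4)]
```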


@@ -279,14 +279,14 @@ cdef class Parser(Pipe):
             # Chop sequences into lengths of this many transitions, to make the
             # batch uniform length.
             # We used to randomize this, but it's not clear that actually helps?
-            cut_size = self.cfg["update_with_oracle_cut_size"]
-            states, golds, max_steps = self._init_gold_batch(
+            max_pushes = self.cfg["update_with_oracle_cut_size"]
+            states, golds, _ = self._init_gold_batch(
                 examples,
-                max_length=cut_size
+                max_length=max_pushes
             )
         else:
             states, golds, _ = self.moves.init_gold_batch(examples)
-            max_steps = max([len(eg.x) for eg in examples])
+            max_pushes = max([len(eg.x) for eg in examples])
         if not states:
             return losses
         all_states = list(states)
@@ -302,7 +302,8 @@ cdef class Parser(Pipe):
             backprop(d_scores)
             # Follow the predicted action
             self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+            states_golds = [(s, g) for (s, g) in zip(states, golds)
+                            if s.n_pushes < max_pushes and not s.is_final()]
 
         backprop_tok2vec(golds)
         if sgd not in (None, False):
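The extra condition drops a state from the batch once it has used up its push budget, even if parsing is not finished, keeping the chopped training sequences at a uniform length. A toy sketch with a hypothetical stand-in for the state objects:

```python
class ToyParseState:
    def __init__(self, n_pushes, final):
        self.n_pushes = n_pushes
        self._final = final

    def is_final(self):
        return self._final

max_pushes = 2
states = [ToyParseState(1, False), ToyParseState(2, False), ToyParseState(1, True)]
golds = ["gold0", "gold1", "gold2"]
states_golds = [(s, g) for (s, g) in zip(states, golds)
                if s.n_pushes < max_pushes and not s.is_final()]
print(len(states_golds))  # 1: one state hit its budget, another is already final
```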


@@ -84,9 +84,8 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
-    textcat = nlp.add_pipe("textcat")
-    # Set exclusive labels
-    textcat.model.attrs["multi_label"] = False
+    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -103,9 +102,8 @@ def test_overfitting_IO():
     test_text = "I am happy."
     doc = nlp(test_text)
     cats = doc.cats
-    # note that by default, exclusive_classes = false so we need a bigger error margin
-    assert cats["POSITIVE"] > 0.8
-    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
+    assert cats["POSITIVE"] > 0.9
+    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -113,8 +111,8 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         cats2 = doc2.cats
-        assert cats2["POSITIVE"] > 0.8
-        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
+        assert cats2["POSITIVE"] > 0.9
+        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Test scoring
     scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
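With exclusive classes the two scores should sum to one almost exactly, hence the tighter bound; `pytest.approx(expected, rel)` takes a relative tolerance as its second argument:

```python
import pytest

assert 0.9995 == pytest.approx(1.0, 0.001)      # within 0.1% of 1.0: passes
assert not (0.95 == pytest.approx(1.0, 0.001))  # 5% off: fails the tighter bound
```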


@@ -3,8 +3,9 @@ title: Layers and Model Architectures
 teaser: Power spaCy components with custom neural networks
 menu:
   - ['Type Signatures', 'type-sigs']
-  - ['Defining Sublayers', 'sublayers']
+  - ['Swapping Architectures', 'swap-architectures']
   - ['PyTorch & TensorFlow', 'frameworks']
+  - ['Thinc Models', 'thinc']
   - ['Trainable Components', 'components']
 next: /usage/projects
 ---
@@ -22,8 +23,6 @@ its model architecture. The architecture is like a recipe for the network, and
 you can't change the recipe once the dish has already been prepared. You have to
 make a new one.
 
-![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
-
 ## Type signatures {#type-sigs}
 
 <!-- TODO: update example, maybe simplify definition? -->
@@ -92,9 +91,13 @@ code.
 
 </Infobox>
 
-## Defining sublayers {#sublayers}
+## Swapping model architectures {#swap-architectures}
 
-Model architecture functions often accept **sublayers as arguments**, so that
+<!-- TODO: textcat example, using different architecture in the config -->
+
+### Defining sublayers {#sublayers}
+
+Model architecture functions often accept **sublayers as arguments**, so that
 you can try **substituting a different layer** into the network. Depending on
 how the architecture function is structured, you might be able to define your
 network structure entirely through the [config system](/usage/training#config),
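The renamed section covers swapping architectures through the config. As a rough sketch of what registering a drop-in sublayer looks like (the registry name and dummy implementation are illustrative, not from the diff):

```python
from typing import List
import spacy
from spacy.tokens import Doc
from thinc.api import Model
from thinc.types import Floats2d

@spacy.registry.architectures("my_org.CustomTok2Vec.v1")
def build_custom_tok2vec(width: int) -> Model[List[Doc], List[Floats2d]]:
    # Dummy tok2vec: one zeroed (n_tokens, width) array per doc.
    def forward(model: Model, docs: List[Doc], is_train: bool):
        outputs = [model.ops.alloc2f(len(doc), width) for doc in docs]
        return outputs, lambda d_outputs: []
    return Model("custom_tok2vec", forward)
```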
@@ -114,62 +117,37 @@ approaches. And if you want to define your own solution, all you need to do is
 register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
 you'll be able to try it out in any of the spaCy components.
 
-<!-- TODO: example of switching sublayers -->
-
-### Registering new architectures
-
-- Recap concept, link to config docs.
+<!-- TODO: example of swapping sublayers -->
 
 ## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
 
-<!-- TODO: this is copied over from the Thinc docs and we probably want to shorten it and make it more spaCy-specific -->
-
-Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks)
-written in other machine learning frameworks like PyTorch, TensorFlow and MXNet
-using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as
-**wrapping whole models**, Thinc lets you call into an external framework for
-just **part of your model**: you can have a model where you use PyTorch just for
-the transformer layers, using "native" Thinc layers to do fiddly input and
-output transformations and add on task-specific "heads", as efficiency is less
-of a consideration for those parts of the network.
+Thinc allows you to wrap models written in other machine learning frameworks
+like PyTorch, TensorFlow and MXNet using a unified
+[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole
+models**, Thinc lets you call into an external framework for just **part of your
+model**: you can have a model where you use PyTorch just for the transformer
+layers, using "native" Thinc layers to do fiddly input and output
+transformations and add on task-specific "heads", as efficiency is less of a
+consideration for those parts of the network.
 
+<!-- TODO: custom tagger implemented in PyTorch, wrapped as Thinc model, link off to project (with notebook?) -->
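A minimal sketch of the wrapping the paragraph above describes, with arbitrary layer sizes (this is not the custom tagger the TODO refers to):

```python
from thinc.api import PyTorchWrapper, chain, Linear
import torch.nn

# PyTorch supplies the middle block; "native" Thinc layers form the output head.
torch_block = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU())
model = chain(PyTorchWrapper(torch_block), Linear(nO=4, nI=32))
```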
-Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to
-hold references to external objects. This allows each wrapper space to define a
-custom type, with whatever attributes and methods are helpful, to assist in
-managing the communication between Thinc and the external library. The
-[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in
-a separate list, and communicates with the shims about updates, serialization,
-changes of device, etc.
+## Implementing models in Thinc {#thinc}
 
-The wrapper will receive each batch of inputs, convert them into a suitable form
-for the underlying model instance, and pass them over to the shim, which will
-**manage the actual communication** with the model. The output is then passed
-back into the wrapper, and converted for use in the rest of the network. The
-equivalent procedure happens during backpropagation. Array conversion is handled
-via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so
-that data can be passed between the frameworks **without copying the data back**
-to the host device unnecessarily.
-
-| Framework      | Wrapper layer                                                             | Shim                                                      | DLPack          |
-| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- |
-| **PyTorch**    | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper)       | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims)    | ✅              |
-| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ <sup>1</sup> |
-| **MXNet**      | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper)           | [`MXNetShim`](https://thinc.ai/docs/api-model#shims)      | ✅              |
-
-1. DLPack support in TensorFlow is now
-   [available](https://github.com/tensorflow/tensorflow/issues/24453) but
-   still experimental.
-
-<!-- TODO:
-- Explain concept
-- Link off to notebook
--->
+<!-- TODO: use same example as above, custom tagger, but implemented in Thinc, link off to Thinc docs where appropriate -->
 
 ## Models for trainable components {#components}
 
 <!-- TODO:
 - Interaction with `predict`, `get_loss` and `set_annotations`
 - Initialization life-cycle with `begin_training`.
+- Link to relation extraction notebook.
+
+Example: relation extraction component (implemented as project template)
 -->
+
+![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
+
+```python
+def update(self, examples):