diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 16e257ce2..cfa126cc4 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -297,9 +297,7 @@ def ensure_pathy(path):
     return Pathy(path)
 
 
-def git_sparse_checkout(
-    repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
-):
+def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
     if dest.exists():
         msg.fail("Destination of checkout must not exist", exits=1)
     if not dest.parent.exists():
@@ -323,21 +321,30 @@ def git_sparse_checkout(
         # This is the "clone, but don't download anything" part.
         cmd = (
             f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            "--filter=blob:none"  # <-- The key bit
+            f"--filter=blob:none "  # <-- The key bit
+            f"-b {branch}"
         )
-        if branch is not None:
-            cmd = f"{cmd} -b {branch}"
         run_command(cmd, capture=True)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
         cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
         ret = run_command(cmd, capture=True)
-        missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+        repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
-        run_command(
-            f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
-        )
+        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
+        cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
+        run_command(cmd, capture=True)
         # And finally, we can checkout our subpath
-        run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
+        cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
+        run_command(cmd)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))
+
+
+def _from_http_to_git(repo):
+    if repo.startswith("http://"):
+        repo = repo.replace(r"http://", r"https://")
+    if repo.startswith(r"https://"):
+        repo = repo.replace("https://", "git@").replace("/", ":", 1)
+        repo = f"{repo}.git"
+    return repo
diff --git a/spacy/cli/project/clone.py b/spacy/cli/project/clone.py
index 7f9a46a46..751c389bc 100644
--- a/spacy/cli/project/clone.py
+++ b/spacy/cli/project/clone.py
@@ -43,7 +43,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
         git_sparse_checkout(repo, name, dest)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
-        msg.fail(err)
+        msg.fail(err, exits=1)
     msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
     if not (project_dir / PROJECT_FILE).exists():
         msg.warn(f"No {PROJECT_FILE} found in directory")
@@ -78,6 +78,7 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
     if not dest.parent.exists():
         # We're not creating parents, parent dir should exist
         msg.fail(
-            f"Can't clone project, parent directory doesn't exist: {dest.parent}",
+            f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
+            f"Create the necessary folder(s) first before continuing.",
             exits=1,
         )
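Note on the `_from_http_to_git` helper added above: the rewritten checkout path now converts an HTTP(S) clone URL into the SSH-style form before handing it to `git fetch-pack`. A standalone copy of the helper, with an illustrative URL that is not part of the diff:

```python
# Copy of the helper added in spacy/cli/_util.py above; the example
# URL below is illustrative only.
def _from_http_to_git(repo):
    if repo.startswith("http://"):
        repo = repo.replace(r"http://", r"https://")
    if repo.startswith(r"https://"):
        repo = repo.replace("https://", "git@").replace("/", ":", 1)
        repo = f"{repo}.git"
    return repo

assert (
    _from_http_to_git("https://github.com/explosion/projects")
    == "git@github.com:explosion/projects.git"
)
```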
" + f"Create the necessary folder(s) first before continuing.", exits=1, ) diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index 597f01b65..f827cd677 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "") _currency = r"\$¢£€¥฿" _quotes = CONCAT_QUOTES.replace("'", "") +_units = UNITS.replace("%", "") _prefixes = ( LIST_PUNCT @@ -26,7 +27,7 @@ _suffixes = ( r"(?<=[0-9])\+", r"(?<=°[FfCcKk])\.", r"(?<=[0-9])(?:[{c}])".format(c=_currency), - r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9])(?:{u})".format(u=_units), r"(?<=[{al}{e}{q}(?:{c})])\.".format( al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency ), diff --git a/spacy/pipeline/_parser_internals/_state.pxd b/spacy/pipeline/_parser_internals/_state.pxd index 0d0dd8c05..d31430124 100644 --- a/spacy/pipeline/_parser_internals/_state.pxd +++ b/spacy/pipeline/_parser_internals/_state.pxd @@ -42,6 +42,7 @@ cdef cppclass StateC: RingBufferC _hist int length int offset + int n_pushes int _s_i int _b_i int _e_i @@ -49,6 +50,7 @@ cdef cppclass StateC: __init__(const TokenC* sent, int length) nogil: cdef int PADDING = 5 + this.n_pushes = 0 this._buffer = calloc(length + (PADDING * 2), sizeof(int)) this._stack = calloc(length + (PADDING * 2), sizeof(int)) this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) @@ -335,6 +337,7 @@ cdef cppclass StateC: this.set_break(this.B_(0).l_edge) if this._b_i > this._break: this._break = -1 + this.n_pushes += 1 void pop() nogil: if this._s_i >= 1: @@ -351,6 +354,7 @@ cdef cppclass StateC: this._buffer[this._b_i] = this.S(0) this._s_i -= 1 this.shifted[this.B(0)] = True + this.n_pushes -= 1 void add_arc(int head, int child, attr_t label) nogil: if this.has_head(child): @@ -431,6 +435,7 @@ cdef cppclass StateC: this._break = src._break this.offset = src.offset this._empty_token = src._empty_token + this.n_pushes = src.n_pushes void fast_forward() nogil: # space token attachement policy: diff --git a/spacy/pipeline/_parser_internals/stateclass.pyx b/spacy/pipeline/_parser_internals/stateclass.pyx index 880cf6cc5..d59ade467 100644 --- a/spacy/pipeline/_parser_internals/stateclass.pyx +++ b/spacy/pipeline/_parser_internals/stateclass.pyx @@ -36,6 +36,10 @@ cdef class StateClass: hist[i] = self.c.get_hist(i+1) return hist + @property + def n_pushes(self): + return self.c.n_pushes + def is_final(self): return self.c.is_final() diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index c94cb6b58..f831caefe 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -289,7 +289,14 @@ class Tagger(Pipe): err = Errors.E1006.format(name="Tagger") raise ValueError(err) self.set_output(len(self.labels)) - self.model.initialize(X=doc_sample) + if doc_sample: + label_sample = [ + self.model.ops.alloc2f(len(doc), len(self.labels)) + for doc in doc_sample + ] + self.model.initialize(X=doc_sample, Y=label_sample) + else: + self.model.initialize() if sgd is None: sgd = self.create_optimizer() return sgd diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index 2eadfa6aa..2169b4c17 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -279,14 +279,14 @@ cdef class Parser(Pipe): # Chop sequences into lengths of this many transitions, to make the # batch uniform length. # We used to randomize this, but it's not clear that actually helps? 
diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx
index 2eadfa6aa..2169b4c17 100644
--- a/spacy/pipeline/transition_parser.pyx
+++ b/spacy/pipeline/transition_parser.pyx
@@ -279,14 +279,14 @@ cdef class Parser(Pipe):
             # Chop sequences into lengths of this many transitions, to make the
             # batch uniform length.
             # We used to randomize this, but it's not clear that actually helps?
-            cut_size = self.cfg["update_with_oracle_cut_size"]
-            states, golds, max_steps = self._init_gold_batch(
+            max_pushes = self.cfg["update_with_oracle_cut_size"]
+            states, golds, _ = self._init_gold_batch(
                 examples,
-                max_length=cut_size
+                max_length=max_pushes
             )
         else:
             states, golds, _ = self.moves.init_gold_batch(examples)
-            max_steps = max([len(eg.x) for eg in examples])
+            max_pushes = max([len(eg.x) for eg in examples])
         if not states:
             return losses
         all_states = list(states)
@@ -302,7 +302,8 @@ cdef class Parser(Pipe):
                 backprop(d_scores)
                 # Follow the predicted action
                 self.transition_states(states, scores)
-                states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
+                states_golds = [(s, g) for (s, g) in zip(states, golds)
+                                if s.n_pushes < max_pushes and not s.is_final()]
 
             backprop_tok2vec(golds)
             if sgd not in (None, False):
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 66c27b233..12ead90cb 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -84,9 +84,8 @@ def test_overfitting_IO():
     # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
     fix_random_seed(0)
     nlp = English()
-    textcat = nlp.add_pipe("textcat")
     # Set exclusive labels
-    textcat.model.attrs["multi_label"] = False
+    textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -103,9 +102,8 @@ def test_overfitting_IO():
     test_text = "I am happy."
     doc = nlp(test_text)
     cats = doc.cats
-    # note that by default, exclusive_classes = false so we need a bigger error margin
-    assert cats["POSITIVE"] > 0.8
-    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
+    assert cats["POSITIVE"] > 0.9
+    assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Also test the results are still the same after IO
     with make_tempdir() as tmp_dir:
@@ -113,8 +111,8 @@ def test_overfitting_IO():
         nlp2 = util.load_model_from_path(tmp_dir)
         doc2 = nlp2(test_text)
         cats2 = doc2.cats
-        assert cats2["POSITIVE"] > 0.8
-        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
+        assert cats2["POSITIVE"] > 0.9
+        assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
 
     # Test scoring
     scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
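On the tightened assertions in the test above: with `exclusive_classes: True` the textcat scores should come out of a softmax-style normalization, so the two class probabilities are expected to sum to 1 almost exactly — hence the tolerance dropping from `0.1` to `0.001` and the margin from `0.8` to `0.9`. A hedged illustration with made-up scores:

```python
import pytest

# Made-up, normalized scores for an exclusive two-class textcat.
cats = {"POSITIVE": 0.95, "NEGATIVE": 0.05}
assert cats["POSITIVE"] > 0.9
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
```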
diff --git a/website/docs/usage/layers-architectures.md b/website/docs/usage/layers-architectures.md
index aca9a76e5..3ef28acaf 100644
--- a/website/docs/usage/layers-architectures.md
+++ b/website/docs/usage/layers-architectures.md
@@ -3,8 +3,9 @@ title: Layers and Model Architectures
 teaser: Power spaCy components with custom neural networks
 menu:
   - ['Type Signatures', 'type-sigs']
-  - ['Defining Sublayers', 'sublayers']
+  - ['Swapping Architectures', 'swap-architectures']
   - ['PyTorch & TensorFlow', 'frameworks']
+  - ['Thinc Models', 'thinc']
   - ['Trainable Components', 'components']
 next: /usage/projects
 ---
@@ -22,8 +23,6 @@ its model architecture. The architecture is like a recipe for the network, and
 you can't change the recipe once the dish has already been prepared. You have
 to make a new one.
 
-![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
-
 ## Type signatures {#type-sigs}
 
 
@@ -92,9 +91,13 @@
 
 code.
 
-## Defining sublayers {#sublayers}
+## Swapping model architectures {#swap-architectures}
 
-Model architecture functions often accept **sublayers as arguments**, so that
+
+
+### Defining sublayers {#sublayers}
+
+Model architecture functions often accept **sublayers as arguments**, so that
 you can try **substituting a different layer** into the network. Depending on
 how the architecture function is structured, you might be able to define your
 network structure entirely through the [config system](/usage/training#config),
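To make the new "Defining sublayers" section concrete — a minimal sketch of registering a drop-in `Model[List[Doc], List[Floats2d]]` architecture function, as the docs describe. The registry name and the zero-output body are hypothetical, not part of this diff:

```python
from typing import List
from thinc.api import Model
from thinc.types import Floats2d
from spacy import registry
from spacy.tokens import Doc

@registry.architectures("my_org.ZeroTok2Vec.v1")  # hypothetical name
def zero_tok2vec(width: int) -> Model[List[Doc], List[Floats2d]]:
    # Degenerate tok2vec sublayer: one all-zeros array per doc.
    def forward(model: Model, docs: List[Doc], is_train: bool):
        outputs = [model.ops.alloc2f(len(doc), width) for doc in docs]
        return outputs, lambda d_outputs: []
    return Model("zero_tok2vec", forward)
```

Any function registered this way can then be referenced from the config in place of the default tok2vec sublayer.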
@@ -114,62 +117,37 @@
 approaches. And if you want to define your own solution, all you need to do is
 register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
 you'll be able to try it out in any of the spaCy components.
-
-
-### Registering new architectures
-
-- Recap concept, link to config docs.
+
 
 ## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
 
-
+Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks)
+written in other machine learning frameworks like PyTorch, TensorFlow and MXNet
+using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as
+**wrapping whole models**, Thinc lets you call into an external framework for
+just **part of your model**: you can have a model where you use PyTorch just for
+the transformer layers, using "native" Thinc layers to do fiddly input and
+output transformations and add on task-specific "heads", as efficiency is less
+of a consideration for those parts of the network.
+
-Thinc allows you to wrap models written in other machine learning frameworks
-like PyTorch, TensorFlow and MXNet using a unified
-[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole
-models**, Thinc lets you call into an external framework for just **part of your
-model**: you can have a model where you use PyTorch just for the transformer
-layers, using "native" Thinc layers to do fiddly input and output
-transformations and add on task-specific "heads", as efficiency is less of a
-consideration for those parts of the network.
-
-Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to
-hold references to external objects. This allows each wrapper space to define a
-custom type, with whatever attributes and methods are helpful, to assist in
-managing the communication between Thinc and the external library. The
-[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in
-a separate list, and communicates with the shims about updates, serialization,
-changes of device, etc.
+## Implementing models in Thinc {#thinc}
 
-The wrapper will receive each batch of inputs, convert them into a suitable form
-for the underlying model instance, and pass them over to the shim, which will
-**manage the actual communication** with the model. The output is then passed
-back into the wrapper, and converted for use in the rest of the network. The
-equivalent procedure happens during backpropagation. Array conversion is handled
-via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so
-that data can be passed between the frameworks **without copying the data back**
-to the host device unnecessarily.
-
-| Framework      | Wrapper layer                                                             | Shim                                                      | DLPack          |
-| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- |
-| **PyTorch**    | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper)       | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims)    | ✅              |
-| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ 1            |
-| **MXNet**      | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper)           | [`MXNetShim`](https://thinc.ai/docs/api-model#shims)      | ✅              |
-
-1. DLPack support in TensorFlow is now
-   [available](<(https://github.com/tensorflow/tensorflow/issues/24453)>) but
-   still experimental.
-
-
+
 
 ## Models for trainable components {#components}
 
+
+![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
 
 ```python
 def update(self, examples):
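To make the rewritten frameworks paragraph concrete — a minimal sketch of wrapping part of a network with Thinc's [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper) and composing it with a native layer; the layer sizes are arbitrary, and this example is not part of the diff:

```python
# A torch module becomes a regular Thinc Model and composes with native layers.
import torch.nn
from thinc.api import PyTorchWrapper, Softmax, chain

torch_block = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.ReLU())
model = chain(PyTorchWrapper(torch_block), Softmax(nO=5, nI=32))
```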