mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge remote-tracking branch 'upstream/develop' into feature/docs-layers
# Conflicts: # website/docs/usage/layers-architectures.md
This commit is contained in:
commit
422df9c2e2
|
@ -297,9 +297,7 @@ def ensure_pathy(path):
|
|||
return Pathy(path)
|
||||
|
||||
|
||||
def git_sparse_checkout(
|
||||
repo: str, subpath: str, dest: Path, *, branch: Optional[str] = None
|
||||
):
|
||||
def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "master"):
|
||||
if dest.exists():
|
||||
msg.fail("Destination of checkout must not exist", exits=1)
|
||||
if not dest.parent.exists():
|
||||
|
@ -323,21 +321,30 @@ def git_sparse_checkout(
|
|||
# This is the "clone, but don't download anything" part.
|
||||
cmd = (
|
||||
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
|
||||
"--filter=blob:none" # <-- The key bit
|
||||
f"--filter=blob:none " # <-- The key bit
|
||||
f"-b {branch}"
|
||||
)
|
||||
if branch is not None:
|
||||
cmd = f"{cmd} -b {branch}"
|
||||
run_command(cmd, capture=True)
|
||||
# Now we need to find the missing filenames for the subpath we want.
|
||||
# Looking for this 'rev-list' command in the git --help? Hah.
|
||||
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
|
||||
ret = run_command(cmd, capture=True)
|
||||
missings = "\n".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
repo = _from_http_to_git(repo)
|
||||
# Now pass those missings into another bit of git internals
|
||||
run_command(
|
||||
f"git -C {tmp_dir} fetch-pack --stdin {repo}", capture=True, stdin=missings
|
||||
)
|
||||
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
|
||||
cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
|
||||
run_command(cmd, capture=True)
|
||||
# And finally, we can checkout our subpath
|
||||
run_command(f"git -C {tmp_dir} checkout {branch} {subpath}")
|
||||
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
|
||||
run_command(cmd)
|
||||
# We need Path(name) to make sure we also support subdirectories
|
||||
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
|
||||
|
||||
|
||||
def _from_http_to_git(repo):
|
||||
if repo.startswith("http://"):
|
||||
repo = repo.replace(r"http://", r"https://")
|
||||
if repo.startswith(r"https://"):
|
||||
repo = repo.replace("https://", "git@").replace("/", ":", 1)
|
||||
repo = f"{repo}.git"
|
||||
return repo
|
||||
|
|
|
@ -43,7 +43,7 @@ def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> N
|
|||
git_sparse_checkout(repo, name, dest)
|
||||
except subprocess.CalledProcessError:
|
||||
err = f"Could not clone '{name}' from repo '{repo_name}'"
|
||||
msg.fail(err)
|
||||
msg.fail(err, exits=1)
|
||||
msg.good(f"Cloned '{name}' from {repo_name}", project_dir)
|
||||
if not (project_dir / PROJECT_FILE).exists():
|
||||
msg.warn(f"No {PROJECT_FILE} found in directory")
|
||||
|
@ -78,6 +78,7 @@ def check_clone(name: str, dest: Path, repo: str) -> None:
|
|||
if not dest.parent.exists():
|
||||
# We're not creating parents, parent dir should exist
|
||||
msg.fail(
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}",
|
||||
f"Can't clone project, parent directory doesn't exist: {dest.parent}. "
|
||||
f"Create the necessary folder(s) first before continuing.",
|
||||
exits=1,
|
||||
)
|
||||
|
|
|
@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
|||
|
||||
_currency = r"\$¢£€¥฿"
|
||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||
_units = UNITS.replace("%", "")
|
||||
|
||||
_prefixes = (
|
||||
LIST_PUNCT
|
||||
|
@ -26,7 +27,7 @@ _suffixes = (
|
|||
r"(?<=[0-9])\+",
|
||||
r"(?<=°[FfCcKk])\.",
|
||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||
),
|
||||
|
|
|
@ -42,6 +42,7 @@ cdef cppclass StateC:
|
|||
RingBufferC _hist
|
||||
int length
|
||||
int offset
|
||||
int n_pushes
|
||||
int _s_i
|
||||
int _b_i
|
||||
int _e_i
|
||||
|
@ -49,6 +50,7 @@ cdef cppclass StateC:
|
|||
|
||||
__init__(const TokenC* sent, int length) nogil:
|
||||
cdef int PADDING = 5
|
||||
this.n_pushes = 0
|
||||
this._buffer = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
||||
this._stack = <int*>calloc(length + (PADDING * 2), sizeof(int))
|
||||
this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
|
||||
|
@ -335,6 +337,7 @@ cdef cppclass StateC:
|
|||
this.set_break(this.B_(0).l_edge)
|
||||
if this._b_i > this._break:
|
||||
this._break = -1
|
||||
this.n_pushes += 1
|
||||
|
||||
void pop() nogil:
|
||||
if this._s_i >= 1:
|
||||
|
@ -351,6 +354,7 @@ cdef cppclass StateC:
|
|||
this._buffer[this._b_i] = this.S(0)
|
||||
this._s_i -= 1
|
||||
this.shifted[this.B(0)] = True
|
||||
this.n_pushes -= 1
|
||||
|
||||
void add_arc(int head, int child, attr_t label) nogil:
|
||||
if this.has_head(child):
|
||||
|
@ -431,6 +435,7 @@ cdef cppclass StateC:
|
|||
this._break = src._break
|
||||
this.offset = src.offset
|
||||
this._empty_token = src._empty_token
|
||||
this.n_pushes = src.n_pushes
|
||||
|
||||
void fast_forward() nogil:
|
||||
# space token attachement policy:
|
||||
|
|
|
@ -36,6 +36,10 @@ cdef class StateClass:
|
|||
hist[i] = self.c.get_hist(i+1)
|
||||
return hist
|
||||
|
||||
@property
|
||||
def n_pushes(self):
|
||||
return self.c.n_pushes
|
||||
|
||||
def is_final(self):
|
||||
return self.c.is_final()
|
||||
|
||||
|
|
|
@ -289,7 +289,14 @@ class Tagger(Pipe):
|
|||
err = Errors.E1006.format(name="Tagger")
|
||||
raise ValueError(err)
|
||||
self.set_output(len(self.labels))
|
||||
self.model.initialize(X=doc_sample)
|
||||
if doc_sample:
|
||||
label_sample = [
|
||||
self.model.ops.alloc2f(len(doc), len(self.labels))
|
||||
for doc in doc_sample
|
||||
]
|
||||
self.model.initialize(X=doc_sample, Y=label_sample)
|
||||
else:
|
||||
self.model.initialize()
|
||||
if sgd is None:
|
||||
sgd = self.create_optimizer()
|
||||
return sgd
|
||||
|
|
|
@ -279,14 +279,14 @@ cdef class Parser(Pipe):
|
|||
# Chop sequences into lengths of this many transitions, to make the
|
||||
# batch uniform length.
|
||||
# We used to randomize this, but it's not clear that actually helps?
|
||||
cut_size = self.cfg["update_with_oracle_cut_size"]
|
||||
states, golds, max_steps = self._init_gold_batch(
|
||||
max_pushes = self.cfg["update_with_oracle_cut_size"]
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=cut_size
|
||||
max_length=max_pushes
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
max_steps = max([len(eg.x) for eg in examples])
|
||||
max_pushes = max([len(eg.x) for eg in examples])
|
||||
if not states:
|
||||
return losses
|
||||
all_states = list(states)
|
||||
|
@ -302,7 +302,8 @@ cdef class Parser(Pipe):
|
|||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds)
|
||||
if s.n_pushes < max_pushes and not s.is_final()]
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
if sgd not in (None, False):
|
||||
|
|
|
@ -84,9 +84,8 @@ def test_overfitting_IO():
|
|||
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
|
||||
fix_random_seed(0)
|
||||
nlp = English()
|
||||
textcat = nlp.add_pipe("textcat")
|
||||
# Set exclusive labels
|
||||
textcat.model.attrs["multi_label"] = False
|
||||
textcat = nlp.add_pipe("textcat", config={"model": {"exclusive_classes": True}})
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
@ -103,9 +102,8 @@ def test_overfitting_IO():
|
|||
test_text = "I am happy."
|
||||
doc = nlp(test_text)
|
||||
cats = doc.cats
|
||||
# note that by default, exclusive_classes = false so we need a bigger error margin
|
||||
assert cats["POSITIVE"] > 0.8
|
||||
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||
assert cats["POSITIVE"] > 0.9
|
||||
assert cats["POSITIVE"] + cats["NEGATIVE"] == pytest.approx(1.0, 0.001)
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
|
@ -113,8 +111,8 @@ def test_overfitting_IO():
|
|||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
cats2 = doc2.cats
|
||||
assert cats2["POSITIVE"] > 0.8
|
||||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.1)
|
||||
assert cats2["POSITIVE"] > 0.9
|
||||
assert cats2["POSITIVE"] + cats2["NEGATIVE"] == pytest.approx(1.0, 0.001)
|
||||
|
||||
# Test scoring
|
||||
scores = nlp.evaluate(train_examples, scorer_cfg={"positive_label": "POSITIVE"})
|
||||
|
|
|
@ -3,8 +3,9 @@ title: Layers and Model Architectures
|
|||
teaser: Power spaCy components with custom neural networks
|
||||
menu:
|
||||
- ['Type Signatures', 'type-sigs']
|
||||
- ['Defining Sublayers', 'sublayers']
|
||||
- ['Swapping Architectures', 'swap-architectures']
|
||||
- ['PyTorch & TensorFlow', 'frameworks']
|
||||
- ['Thinc Models', 'thinc']
|
||||
- ['Trainable Components', 'components']
|
||||
next: /usage/projects
|
||||
---
|
||||
|
@ -22,8 +23,6 @@ its model architecture. The architecture is like a recipe for the network, and
|
|||
you can't change the recipe once the dish has already been prepared. You have to
|
||||
make a new one.
|
||||
|
||||
![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
|
||||
|
||||
## Type signatures {#type-sigs}
|
||||
|
||||
<!-- TODO: update example, maybe simplify definition? -->
|
||||
|
@ -92,9 +91,13 @@ code.
|
|||
|
||||
</Infobox>
|
||||
|
||||
## Defining sublayers {#sublayers}
|
||||
## Swapping model architectures {#swap-architectures}
|
||||
|
||||
Model architecture functions often accept **sublayers as arguments**, so that
|
||||
<!-- TODO: textcat example, using different architecture in the config -->
|
||||
|
||||
### Defining sublayers {#sublayers}
|
||||
|
||||
Model architecture functions often accept **sublayers as arguments**, so that
|
||||
you can try **substituting a different layer** into the network. Depending on
|
||||
how the architecture function is structured, you might be able to define your
|
||||
network structure entirely through the [config system](/usage/training#config),
|
||||
|
@ -114,62 +117,37 @@ approaches. And if you want to define your own solution, all you need to do is
|
|||
register a ~~Model[List[Doc], List[Floats2d]]~~ architecture function, and
|
||||
you'll be able to try it out in any of the spaCy components.
|
||||
|
||||
<!-- TODO: example of switching sublayers -->
|
||||
|
||||
### Registering new architectures
|
||||
|
||||
- Recap concept, link to config docs.
|
||||
<!-- TODO: example of swapping sublayers -->
|
||||
|
||||
## Wrapping PyTorch, TensorFlow and other frameworks {#frameworks}
|
||||
|
||||
<!-- TODO: this is copied over from the Thinc docs and we probably want to shorten it and make it more spaCy-specific -->
|
||||
Thinc allows you to [wrap models](https://thinc.ai/docs/usage-frameworks)
|
||||
written in other machine learning frameworks like PyTorch, TensorFlow and MXNet
|
||||
using a unified [`Model`](https://thinc.ai/docs/api-model) API. As well as
|
||||
**wrapping whole models**, Thinc lets you call into an external framework for
|
||||
just **part of your model**: you can have a model where you use PyTorch just for
|
||||
the transformer layers, using "native" Thinc layers to do fiddly input and
|
||||
output transformations and add on task-specific "heads", as efficiency is less
|
||||
of a consideration for those parts of the network.
|
||||
|
||||
Thinc allows you to wrap models written in other machine learning frameworks
|
||||
like PyTorch, TensorFlow and MXNet using a unified
|
||||
[`Model`](https://thinc.ai/docs/api-model) API. As well as **wrapping whole
|
||||
models**, Thinc lets you call into an external framework for just **part of your
|
||||
model**: you can have a model where you use PyTorch just for the transformer
|
||||
layers, using "native" Thinc layers to do fiddly input and output
|
||||
transformations and add on task-specific "heads", as efficiency is less of a
|
||||
consideration for those parts of the network.
|
||||
<!-- TODO: custom tagger implemented in PyTorch, wrapped as Thinc model, link off to project (with notebook?) -->
|
||||
|
||||
Thinc uses a special class, [`Shim`](https://thinc.ai/docs/api-model#shim), to
|
||||
hold references to external objects. This allows each wrapper space to define a
|
||||
custom type, with whatever attributes and methods are helpful, to assist in
|
||||
managing the communication between Thinc and the external library. The
|
||||
[`Model`](https://thinc.ai/docs/api-model#model) class holds `shim` instances in
|
||||
a separate list, and communicates with the shims about updates, serialization,
|
||||
changes of device, etc.
|
||||
## Implementing models in Thinc {#thinc}
|
||||
|
||||
The wrapper will receive each batch of inputs, convert them into a suitable form
|
||||
for the underlying model instance, and pass them over to the shim, which will
|
||||
**manage the actual communication** with the model. The output is then passed
|
||||
back into the wrapper, and converted for use in the rest of the network. The
|
||||
equivalent procedure happens during backpropagation. Array conversion is handled
|
||||
via the [DLPack](https://github.com/dmlc/dlpack) standard wherever possible, so
|
||||
that data can be passed between the frameworks **without copying the data back**
|
||||
to the host device unnecessarily.
|
||||
|
||||
| Framework | Wrapper layer | Shim | DLPack |
|
||||
| -------------- | ------------------------------------------------------------------------- | --------------------------------------------------------- | --------------- |
|
||||
| **PyTorch** | [`PyTorchWrapper`](https://thinc.ai/docs/api-layers#pytorchwrapper) | [`PyTorchShim`](https://thinc.ai/docs/api-model#shims) | ✅ |
|
||||
| **TensorFlow** | [`TensorFlowWrapper`](https://thinc.ai/docs/api-layers#tensorflowwrapper) | [`TensorFlowShim`](https://thinc.ai/docs/api-model#shims) | ❌ <sup>1</sup> |
|
||||
| **MXNet** | [`MXNetWrapper`](https://thinc.ai/docs/api-layers#mxnetwrapper) | [`MXNetShim`](https://thinc.ai/docs/api-model#shims) | ✅ |
|
||||
|
||||
1. DLPack support in TensorFlow is now
|
||||
[available](<(https://github.com/tensorflow/tensorflow/issues/24453)>) but
|
||||
still experimental.
|
||||
|
||||
<!-- TODO:
|
||||
- Explain concept
|
||||
- Link off to notebook
|
||||
-->
|
||||
<!-- TODO: use same example as above, custom tagger, but implemented in Thinc, link off to Thinc docs where appropriate -->
|
||||
|
||||
## Models for trainable components {#components}
|
||||
|
||||
<!-- TODO:
|
||||
|
||||
- Interaction with `predict`, `get_loss` and `set_annotations`
|
||||
- Initialization life-cycle with `begin_training`.
|
||||
- Link to relation extraction notebook.
|
||||
|
||||
Example: relation extraction component (implemented as project template)
|
||||
|
||||
-->
|
||||
|
||||
![Diagram of a pipeline component with its model](../images/layers-architectures.svg)
|
||||
|
||||
```python
|
||||
def update(self, examples):
|
||||
|
|
Loading…
Reference in New Issue
Block a user