mirror of https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Update pretrain to prevent unintended overwriting of weight fil… (#3902)
* Update pretrain to prevent unintended overwriting of weight files for #3859
* Add '--epoch-start' to pretrain docs
* Add missing pretrain arguments to bash example
* Update doc tag for v2.1.5
This commit is contained in:
parent 6d577f0b92
commit 04982ccc40
@@ -5,6 +5,7 @@ import plac
 import random
 import numpy
 import time
+import re
 from collections import Counter
 from pathlib import Path
 from thinc.v2v import Affine, Maxout
@@ -65,6 +66,13 @@ from .train import _load_pretrained_tok2vec
         "t2v",
         Path,
     ),
+    epoch_start=(
+        "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been "
+        "renamed. Prevents unintended overwriting of existing weight files.",
+        "option",
+        "es",
+        int
+    ),
 )
 def pretrain(
     texts_loc,
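The added tuple follows plac's annotation format of `(help text, kind, abbreviation, type)`, which is what exposes the new parameter as an `-es` option on the command line. A minimal standalone sketch, not part of the commit (the `demo` function is made up for illustration):

```python
import plac

@plac.annotations(
    # (help text, kind, abbreviation, type) - the same shape as the tuple above
    epoch_start=("The epoch to start counting at.", "option", "es", int),
)
def demo(epoch_start=None):
    print("epoch_start =", epoch_start)

if __name__ == "__main__":
    # e.g. `python demo.py -es 346` prints "epoch_start = 346"
    plac.call(demo)
```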
@@ -83,6 +91,7 @@ def pretrain(
     seed=0,
     n_save_every=None,
     init_tok2vec=None,
+    epoch_start=None,
 ):
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
@@ -151,9 +160,29 @@ def pretrain(
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        # Parse the epoch number from the given weight file
+        model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
+        if model_name:
+            # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin'
+            epoch_start = int(model_name.group(0)[5:][:-4]) + 1
+        else:
+            if not epoch_start:
+                msg.fail(
+                    "You have to use the '--epoch-start' argument when using a renamed weight file for "
+                    "'--init-tok2vec'", exits=True
+                )
+            elif epoch_start < 0:
+                msg.fail(
+                    "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" % epoch_start,
+                    exits=True
+                )
+    else:
+        # Without '--init-tok2vec' the '--epoch-start' argument is ignored
+        epoch_start = 0
 
     optimizer = create_default_optimizer(model.ops)
     tracker = ProgressTracker(frequency=10000)
-    msg.divider("Pre-training tok2vec layer")
+    msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start)
     row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
     msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
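The parsing branch above can be read in isolation as the following sketch, a standalone rewrite rather than the commit's code (`parse_epoch_start` is a made-up helper and `ValueError` stands in for `msg.fail`): a weight file that still has its default name, e.g. `model345.bin`, resumes counting at epoch 346, while a renamed file requires an explicit `--epoch-start`.

```python
import re

def parse_epoch_start(init_tok2vec, epoch_start=None):
    # Default weight file names look like 'model<epoch>.bin', so the next epoch
    # can be read off by cutting away the 'model' prefix and '.bin' suffix.
    match = re.search(r"model\d+\.bin", str(init_tok2vec))
    if match:
        return int(match.group(0)[5:][:-4]) + 1
    # Renamed weight file: mirror the commit and insist on an explicit value
    # (note that, like the commit's `if not epoch_start`, this also rejects 0).
    if not epoch_start:
        raise ValueError("a renamed weight file requires '--epoch-start'")
    if epoch_start < 0:
        raise ValueError("'--epoch-start' has to be greater or equal to 0")
    return epoch_start

print(parse_epoch_start("/output/model345.bin"))         # -> 346
print(parse_epoch_start("/output/my_weights.bin", 12))   # -> 12
```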
@@ -174,7 +203,7 @@ def pretrain(
                 file_.write(srsly.json_dumps(log) + "\n")
 
     skip_counter = 0
-    for epoch in range(n_iter):
+    for epoch in range(epoch_start, n_iter + epoch_start):
         for batch_id, batch in enumerate(
             util.minibatch_by_words(((text, None) for text in texts), size=batch_size)
         ):
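Because the epoch counter now starts at `epoch_start`, the weight files written each epoch continue the numbering of the previous run instead of starting over. A tiny illustration, assuming the default `model%d.bin` naming that the regex above expects:

```python
# Resuming 3 more iterations at epoch_start=346 produces new file names instead
# of starting over at model0.bin and overwriting earlier output.
n_iter, epoch_start = 3, 346
for epoch in range(epoch_start, n_iter + epoch_start):
    print("model%d.bin" % epoch)  # model346.bin, model347.bin, model348.bin
```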
@@ -284,9 +284,9 @@ same between pretraining and training. The API and errors around this need some
 improvement.
 
 ```bash
-$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
-[--depth] [--embed-rows] [--loss_func] [--dropout] [--seed] [--n-iter] [--use-vectors]
-[--n-save_every]
+$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
+[--width] [--depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length]
+[--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start]
 ```
 
 | Argument | Type | Description |
@@ -306,7 +306,8 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width]
 | `--n-iter`, `-i` | option | Number of iterations to pretrain. |
 | `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. |
 | `--n-save-every`, `-se` | option | Save model every X batches. |
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.|
+| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.|
 | **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
 
 ### JSONL format for raw text {#pretrain-jsonl}
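As a usage sketch (paths, corpus and weight file names below are placeholders, not from the commit), resuming a pretraining run from a renamed weight file could look like this; with an unrenamed `modelXX.bin` file, `--epoch-start` can be omitted because the epoch is parsed from the file name:

```bash
# Resume pretraining from a renamed weight file, counting epochs from 346 onwards
python -m spacy pretrain texts.jsonl en_vectors_web_lg /output \
    --init-tok2vec /output/weights_backup.bin --epoch-start 346
```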