Mirror of https://github.com/explosion/spaCy.git
Fix pretraining in train script (#6143)
* update pretraining API in train CLI
* bump thinc to 8.0.0a35
* bump to 3.0.0a26
* doc fixes
* small doc fix
parent 02a1b6ab83
commit 009ba14aaf
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
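Since the release pins move in lockstep (build requires, runtime requires, and the package version itself), a quick sanity check of an installed build can be handy. A minimal sketch, not part of the commit, assuming `spacy-nightly` is installed and using the `packaging` helper so that pre-release tags compare correctly:

```python
from importlib.metadata import version
from packaging.version import Version

# Illustrative only: verify the installed builds satisfy the bumped pins.
assert Version(version("spacy-nightly")) == Version("3.0.0a26")
assert Version("8.0.0a35") <= Version(version("thinc")) < Version("8.0.0a40")
```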
@@ -121,20 +121,19 @@ def train(
 
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
 
     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")
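The rewritten block replaces the dotted `tok2vec_model` config path with two explicit `[pretraining]` settings: `component` names the pipe whose model receives the weights, and `layer` optionally names a sublayer, resolved with Thinc's `Model.get_ref`. A minimal sketch of the corresponding config block (values illustrative, not a complete config):

```ini
[pretraining]
# Pipe whose model should receive the pretrained weights
component = "tok2vec"
# Optional named sublayer; an empty string loads into the component's whole model
layer = ""
```

The weights file itself is still supplied separately, via the `init_tok2vec` setting described in the docs changes below.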
@@ -85,7 +85,7 @@ class Warnings:
             "attribute or operator.")
 
     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
@@ -49,7 +49,7 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
     return locs
 
 
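For context, a simplified sketch of the helper this hunk touches: it recursively collects files whose names end in `file_type`, and after the change the W090 warning names that file type too. Details of the real implementation (such as skipping hidden files) are omitted:

```python
import warnings
from pathlib import Path
from typing import List, Union

from spacy.errors import Warnings


def walk_corpus(path: Union[str, Path], file_type: str) -> List[Path]:
    """Simplified sketch: collect all files under ``path`` ending in ``file_type``."""
    orig_path = path
    paths = [Path(path)]
    locs = []
    while paths:
        path = paths.pop()
        if path.is_dir():
            paths.extend(path.iterdir())  # recurse into subdirectories
        elif path.parts[-1].endswith(file_type):
            locs.append(path)
    if len(locs) == 0:
        # After this commit the warning reports the expected file type as well
        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
    return locs
```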
@@ -200,7 +200,7 @@ class JsonlTexts:
 
         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])
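Passing `".jsonl"` instead of `"jsonl"` makes the suffix match the file extension exactly (a bare `jsonl` would also match a name like `myjsonl`) and makes the W090 warning above read correctly. Roughly what the reader then does per file, as a standalone sketch (path and records hypothetical):

```python
import spacy
import srsly

nlp = spacy.blank("en")
# Each line of a .jsonl corpus file is a record like {"text": "..."}
for record in srsly.read_jsonl("corpus/texts.jsonl"):
    doc = nlp.make_doc(record["text"])
    print(doc.text)
```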
@@ -754,7 +754,7 @@ in the section `[paths]`.
 </Infobox>
 
 ```cli
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
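With the flag now documented, a concrete invocation might look like this (paths illustrative; `--gpu-id -1`, the default, runs on CPU):

```cli
$ python -m spacy train config.cfg --output ./output --gpu-id 0
```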
@@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your pipeline. This technique may be especially helpful if you have little
-labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
-for more info.
+labelled data. See the usage docs on
+[pretraining](/usage/embeddings-transformers#pretraining) for more info.
 
 <Infobox title="Changed in v3.0" variant="warning">
 
@@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on
 </Infobox>
 
 ```cli
-$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```
 
 | Name | Description |
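And the pretraining equivalent (again illustrative):

```cli
$ python -m spacy pretrain config.cfg ./pretrain_output --gpu-id 0
```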
@@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting
 leading and trailing characters does that more than adequately, as the exact
 word sequence could be recovered with high accuracy if the initial and trailing
 characters are predicted accurately. With the vectors objective, the pretraining
-is use the embedding space learned by an algorithm such as
+uses the embedding space learned by an algorithm such as
 [GloVe](https://nlp.stanford.edu/projects/glove/) or
 [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
 focus on the contextual modelling we actual care about.
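The claim that initial and trailing characters nearly pin down a word is easy to sanity-check. A toy sketch with an arbitrary word list (not from spaCy) that keys each word by its first and last four characters, roughly the information the character objective predicts:

```python
from collections import Counter

words = [
    "pretraining", "prediction", "characters", "cooccurrence", "embedding",
    "objective", "vectors", "sequence", "accuracy", "statistics",
]
keys = Counter((w[:4], w[-4:]) for w in words)
# Count words whose (prefix, suffix) key collides with another word's
ambiguous = sum(n for n in keys.values() if n > 1)
print(f"{ambiguous} of {len(words)} words share a key with another word")
```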
@@ -175,7 +175,7 @@ sections of a config file are:
 | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
 
 <Infobox title="Config format and settings" emoji="📖">
 
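For orientation, the sections in that table map onto a config skeleton roughly like the following (abridged; a real config also has `[nlp]`, `[components]` and many more settings):

```ini
[paths]
train = "corpus/train.spacy"
dev = "corpus/dev.spacy"

[system]
seed = 0

[training]
# Settings and controls for training and evaluation

[pretraining]
# Optional; present only when language model pretraining is enabled
component = "tok2vec"
layer = ""
```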