Fix pretraining in train script (#6143)

* update pretraining API in train CLI

* bump thinc to 8.0.0a35

* bump to 3.0.0a26

* doc fixes

* small doc fix
Sofie Van Landeghem, 2020-09-25 15:47:10 +02:00 (committed by GitHub)
parent 02a1b6ab83
commit 009ba14aaf

10 changed files with 24 additions and 25 deletions

pyproject.toml

@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"

requirements.txt

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0

setup.cfg

@@ -34,13 +34,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
     blis>=0.4.0,<0.5.0
     wasabi>=0.8.0,<1.1.0
     srsly>=2.1.0,<3.0.0

spacy/about.py

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

spacy/cli/train.py

@@ -121,20 +121,19 @@ def train(
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")

     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")
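
With this change, weight loading is driven by two settings in the `[pretraining]` block rather than a dotted `tok2vec_model` path into the config. A minimal sketch of the block the new code reads (values are illustrative, not guaranteed defaults):

```ini
[pretraining]
# Name of the pipeline component whose model receives the pretrained weights.
component = "tok2vec"
# Optional named node reference inside that model; leave empty to load the
# weights into the component's whole model.
layer = ""
```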

spacy/errors.py

@@ -85,7 +85,7 @@ class Warnings:
             "attribute or operator.")
     # TODO: fix numbering after merging develop into master
-    W090 = ("Could not locate any binary .spacy files in path '{path}'.")
+    W090 = ("Could not locate any {format} files in path '{path}'.")
     W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.")
     W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "

spacy/training/corpus.py

@@ -49,7 +49,7 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
         elif path.parts[-1].endswith(file_type):
             locs.append(path)
     if len(locs) == 0:
-        warnings.warn(Warnings.W090.format(path=orig_path))
+        warnings.warn(Warnings.W090.format(path=orig_path, format=file_type))
     return locs

@@ -200,7 +200,7 @@ class JsonlTexts:
         DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call
         """
-        for loc in walk_corpus(self.path, "jsonl"):
+        for loc in walk_corpus(self.path, ".jsonl"):
             records = srsly.read_jsonl(loc)
             for record in records:
                 doc = nlp.make_doc(record["text"])
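
Together, the errors.py and corpus.py changes make the warning report the actual file type being searched for. A quick standalone check of the formatting (stdlib only, path is illustrative):

```python
# The W090 template now takes a {format} placeholder, and JsonlTexts passes
# ".jsonl" with the leading dot, so the rendered warning reads naturally.
W090 = "Could not locate any {format} files in path '{path}'."
print(W090.format(format=".jsonl", path="/data/corpus"))
# Could not locate any .jsonl files in path '/data/corpus'.
```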

website/docs/api/cli.md

@@ -754,7 +754,7 @@ in the section `[paths]`.
 </Infobox>

 ```cli
-$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides]
+$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides]
 ```

 | Name | Description |

@@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can
 then include a **path to one of these pretrained weights files** in your
 [training config](/usage/training#config) as the `init_tok2vec` setting when you
 train your pipeline. This technique may be especially helpful if you have little
-labelled data. See the usage docs on [pretraining](/usage/training#pretraining)
-for more info.
+labelled data. See the usage docs on
+[pretraining](/usage/embeddings-transformers#pretraining) for more info.

 <Infobox title="Changed in v3.0" variant="warning">

@@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on
 </Infobox>

 ```cli
-$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides]
+$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides]
 ```

 | Name | Description |
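
With the newly documented flag, a typical GPU training run might look like this (config and output paths are illustrative):

```cli
$ python -m spacy train config.cfg --output ./output --gpu-id 0
```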

website/docs/usage/embeddings-transformers.md

@@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting
 leading and trailing characters does that more than adequately, as the exact
 word sequence could be recovered with high accuracy if the initial and trailing
 characters are predicted accurately. With the vectors objective, the pretraining
-is use the embedding space learned by an algorithm such as
+uses the embedding space learned by an algorithm such as
 [GloVe](https://nlp.stanford.edu/projects/glove/) or
 [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
 focus on the contextual modelling we actual care about.
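
As a toy illustration of the vectors objective described in that passage (not spaCy's implementation; names and shapes are made up): the network is trained to reproduce each word's static vector, so a cosine-based loss is a natural fit.

```python
import numpy as np

def cosine_loss(predicted: np.ndarray, target: np.ndarray) -> float:
    # 1 - cosine similarity: 0.0 when the prediction points the same way
    # as the target vector, up to 2.0 when it points the opposite way.
    sim = float(np.dot(predicted, target)) / (
        float(np.linalg.norm(predicted)) * float(np.linalg.norm(target))
    )
    return 1.0 - sim

print(cosine_loss(np.array([1.0, 0.0]), np.array([1.0, 0.0])))  # 0.0
print(cosine_loss(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 1.0
```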

website/docs/usage/training.md

@@ -175,7 +175,7 @@ sections of a config file are:
 | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |

 <Infobox title="Config format and settings" emoji="📖">
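
The rows in that table correspond to top-level blocks of the config file. A bare skeleton, with illustrative values only:

```ini
[paths]
train = "corpus/train.spacy"

[system]
seed = 0

[training]
# settings and controls for training and evaluation

[pretraining]
# optional; only present when language model pretraining is used
```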