From 009ba14aafff1769bff408b2069e69245c441d2b Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Fri, 25 Sep 2020 15:47:10 +0200
Subject: [PATCH] Fix pretraining in train script (#6143)

* update pretraining API in train CLI

* bump thinc to 8.0.0a35

* bump to 3.0.0a26

* doc fixes

* small doc fix
---
 pyproject.toml                                |  2 +-
 requirements.txt                              |  2 +-
 setup.cfg                                     |  4 ++--
 spacy/about.py                                |  2 +-
 spacy/cli/train.py                            | 21 ++++++++++-----------
 spacy/errors.py                               |  2 +-
 spacy/training/corpus.py                      |  4 ++--
 website/docs/api/cli.md                       |  8 ++++----
 website/docs/usage/embeddings-transformers.md |  2 +-
 website/docs/usage/training.md                |  2 +-
 10 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 5290660aa..14d2c1e8e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.0a34,<8.0.0a40",
+    "thinc>=8.0.0a35,<8.0.0a40",
     "blis>=0.4.0,<0.5.0",
     "pytokenizations",
     "pathy"
diff --git a/requirements.txt b/requirements.txt
index a8b237aa1..b3a95dcff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a34,<8.0.0a40
+thinc>=8.0.0a35,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets==0.2.0a0
 murmurhash>=0.28.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 9831402d1..b080d4330 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,13 +34,13 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
 install_requires =
    # Our libraries
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.0a34,<8.0.0a40
+    thinc>=8.0.0a35,<8.0.0a40
    blis>=0.4.0,<0.5.0
    wasabi>=0.8.0,<1.1.0
    srsly>=2.1.0,<3.0.0
diff --git a/spacy/about.py b/spacy/about.py
index ea9f9f33e..fbe772d25 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a25"
+__version__ = "3.0.0a26"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 6d61c2425..cbb0655ef 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -121,20 +121,19 @@ def train(
     # Load pretrained tok2vec weights - cf. CLI command 'pretrain'
     if weights_data is not None:
-        tok2vec_path = config["pretraining"].get("tok2vec_model", None)
-        if tok2vec_path is None:
+        tok2vec_component = config["pretraining"]["component"]
+        if tok2vec_component is None:
             msg.fail(
-                f"To pretrained tok2vec weights, the config needs to specify which "
-                f"tok2vec layer to load in the setting [pretraining.tok2vec_model].",
+                f"To use pretrained tok2vec weights, [pretraining.component] "
+                f"needs to specify the component that should load them.",
                 exits=1,
             )
-        tok2vec = config
-        for subpath in tok2vec_path.split("."):
-            tok2vec = tok2vec.get(subpath)
-        if not tok2vec:
-            err = f"Could not locate the tok2vec model at {tok2vec_path}"
-            msg.fail(err, exits=1)
-        tok2vec.from_bytes(weights_data)
+        layer = nlp.get_pipe(tok2vec_component).model
+        tok2vec_layer = config["pretraining"]["layer"]
+        if tok2vec_layer:
+            layer = layer.get_ref(tok2vec_layer)
+        layer.from_bytes(weights_data)
+        msg.info(f"Loaded pretrained weights into component '{tok2vec_component}'")
     # Create iterator, which yields out info after each optimization step.
     msg.info("Start training")
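For context on the train.py hunk above: the train CLI now resolves the pretrained layer through the pipeline, via `[pretraining.component]` plus an optional `[pretraining.layer]` reference, instead of walking a dotted `tok2vec_model` config path. Below is a rough standalone sketch of that loading pattern; the component name, layer ref, and weights path are illustrative stand-ins, not values taken from this patch:

```python
from pathlib import Path
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")

# Illustrative stand-ins for config["pretraining"]["component"] / ["layer"]
tok2vec_component = "tok2vec"
tok2vec_layer = ""  # falsy -> load into the component's whole model

layer = nlp.get_pipe(tok2vec_component).model
if tok2vec_layer:
    # get_ref() resolves a named sublayer of a Thinc model
    layer = layer.get_ref(tok2vec_layer)

# Weights as produced by `spacy pretrain` (hypothetical path)
weights_data = Path("pretrain_output/model999.bin").read_bytes()
layer.from_bytes(weights_data)
```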
msg.info("Start training") diff --git a/spacy/errors.py b/spacy/errors.py index 4216e3936..640419182 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -85,7 +85,7 @@ class Warnings: "attribute or operator.") # TODO: fix numbering after merging develop into master - W090 = ("Could not locate any binary .spacy files in path '{path}'.") + W090 = ("Could not locate any {format} files in path '{path}'.") W091 = ("Could not clean/remove the temp directory at {dir}: {msg}.") W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.") W093 = ("Could not find any data to train the {name} on. Is your " diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 11f098993..848692f47 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -49,7 +49,7 @@ def walk_corpus(path: Union[str, Path], file_type) -> List[Path]: elif path.parts[-1].endswith(file_type): locs.append(path) if len(locs) == 0: - warnings.warn(Warnings.W090.format(path=orig_path)) + warnings.warn(Warnings.W090.format(path=orig_path, format=file_type)) return locs @@ -200,7 +200,7 @@ class JsonlTexts: DOCS: https://nightly.spacy.io/api/corpus#jsonltexts-call """ - for loc in walk_corpus(self.path, "jsonl"): + for loc in walk_corpus(self.path, ".jsonl"): records = srsly.read_jsonl(loc) for record in records: doc = nlp.make_doc(record["text"]) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 53cd954be..a6cb41e5e 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -754,7 +754,7 @@ in the section `[paths]`. ```cli -$ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides] +$ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id] [overrides] ``` | Name | Description | @@ -778,8 +778,8 @@ pretrained ones. The weights are saved to a directory after each epoch. You can then include a **path to one of these pretrained weights files** in your [training config](/usage/training#config) as the `init_tok2vec` setting when you train your pipeline. This technique may be especially helpful if you have little -labelled data. See the usage docs on [pretraining](/usage/training#pretraining) -for more info. +labelled data. See the usage docs on +[pretraining](/usage/embeddings-transformers#pretraining) for more info. @@ -794,7 +794,7 @@ auto-generated by setting `--pretraining` on ```cli -$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [overrides] +$ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [--epoch-resume] [--gpu-id] [overrides] ``` | Name | Description | diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index b00760e62..97249bfb2 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -752,7 +752,7 @@ network to model something about word cooccurrence statistics. Predicting leading and trailing characters does that more than adequately, as the exact word sequence could be recovered with high accuracy if the initial and trailing characters are predicted accurately. With the vectors objective, the pretraining -is use the embedding space learned by an algorithm such as +uses the embedding space learned by an algorithm such as [GloVe](https://nlp.stanford.edu/projects/glove/) or [Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to focus on the contextual modelling we actual care about. 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 65afd0eb4..54be6b367 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -175,7 +175,7 @@ sections of a config file are:
 | `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
 | `training` | Settings and controls for the training and evaluation process. |
-| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
+| `pretraining` | Optional settings and controls for the [language model pretraining](/usage/embeddings-transformers#pretraining). |
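As a closing illustration of the `${paths.train}`-style variable re-use described in the table above: the config system, built on thinc's `Config` class, interpolates such references when the config is parsed. A minimal sketch, with an invented config string:

```python
from thinc.api import Config

# Invented minimal config demonstrating variable interpolation
CONFIG_STR = """
[paths]
train = "corpus/train.spacy"

[training]
train_path = ${paths.train}
"""

config = Config().from_str(CONFIG_STR)
print(config["training"]["train_path"])  # corpus/train.spacy
```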