From 5f0a91cf3771a96e6bcd0c63a9d70e3fc74020d1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 09:56:29 +0200 Subject: [PATCH 1/3] fix conv-depth parameter --- website/docs/api/cli.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 505977be9..b49a2fb08 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -455,7 +455,7 @@ improvement. ```bash $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] -[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] +[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every] [--init-tok2vec] [--epoch-start] @@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] | `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. | | `output_dir` | positional | Directory to write models to on each epoch. | | `--width`, `-cw` | option | Width of CNN layers. | -| `--depth`, `-cd` | option | Depth of CNN layers. | +| `--conv-depth`, `-cd` | option | Depth of CNN layers. | | `--cnn-window`, `-cW` 2.2.2 | option | Window size for CNN layers. | | `--cnn-pieces`, `-cP` 2.2.2 | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). | | `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. | @@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] ``` -| Argument | Type | Description | -| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | -| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | -| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | -| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | -| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | -| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | -| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | -| **CREATES** | model | A spaCy model containing the vocab and vectors. | +| Argument | Type | Description | +| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. | +| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. | +| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. | +| `--vectors-loc`, `-v` | option | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. | +| `--truncate-vectors`, `-t` | option | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. | +| `--prune-vectors`, `-V` | option | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. | +| `--vectors-name`, `-vn` | option | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. | +| **CREATES** | model | A spaCy model containing the vocab and vectors. | ## Evaluate {#evaluate new="2"} From 04ba37b667764c5b18825a5ee8ce513962e73bcd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 13:52:39 +0200 Subject: [PATCH 2/3] fix description --- examples/training/pretrain_textcat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/pretrain_textcat.py b/examples/training/pretrain_textcat.py index f3e493f6a..d29e20ad1 100644 --- a/examples/training/pretrain_textcat.py +++ b/examples/training/pretrain_textcat.py @@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats): width=("Width of CNN layers", "positional", None, int), embed_size=("Embedding rows", "positional", None, int), pretrain_iters=("Number of iterations to pretrain", "option", "pn", int), - train_iters=("Number of iterations to pretrain", "option", "tn", int), + train_iters=("Number of iterations to train", "option", "tn", int), train_examples=("Number of labelled examples", "option", "eg", int), vectors_model=("Name or path to vectors model to learn from"), ) From 291483157dacfc80ecd6ba2f7e097fbe98a4395a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 29 May 2020 17:38:33 +0200 Subject: [PATCH 3/3] prevent loading a pretrained Tok2Vec layer AND pretrained components --- spacy/cli/train.py | 9 +++++++-- spacy/errors.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6ce095c15..d4de9aeb4 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -15,6 +15,7 @@ import random from .._ml import create_default_optimizer from ..util import use_gpu as set_gpu +from ..errors import Errors from ..gold import GoldCorpus from ..compat import path2str from ..lookups import Lookups @@ -182,6 +183,7 @@ def train( msg.warn("Unable to activate GPU: {}".format(use_gpu)) msg.text("Using CPU only") use_gpu = -1 + base_components = [] if base_model: msg.text("Starting with base model '{}'".format(base_model)) nlp = util.load_model(base_model) @@ -227,6 +229,7 @@ def train( exits=1, ) msg.text("Extending component from base model '{}'".format(pipe)) + base_components.append(pipe) disabled_pipes = nlp.disable_pipes( [p for p in nlp.pipe_names if p not in pipeline] ) @@ -299,7 +302,7 @@ def train( # Load in pretrained weights if init_tok2vec is not None: - components = _load_pretrained_tok2vec(nlp, init_tok2vec) + components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components) msg.text("Loaded pretrained tok2vec for: {}".format(components)) # Verify textcat config @@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors): util.load_model(vectors, vocab=nlp.vocab) -def _load_pretrained_tok2vec(nlp, loc): +def _load_pretrained_tok2vec(nlp, loc, base_components): """Load pretrained weights for the 'token-to-vector' part of the component models, which is typically a CNN. See 'spacy pretrain'. Experimental. """ @@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc): loaded = [] for name, component in nlp.pipeline: if hasattr(component, "model") and hasattr(component.model, "tok2vec"): + if name in base_components: + raise ValueError(Errors.E200.format(component=name)) component.tok2vec.from_bytes(weights_data) loaded.append(name) return loaded diff --git a/spacy/errors.py b/spacy/errors.py index 6d92545d7..11b601e19 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -568,6 +568,8 @@ class Errors(object): E198 = ("Unable to return {n} most similar vectors for the current vectors " "table, which contains {n_rows} vectors.") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") + E200 = ("Specifying a base model with a pretrained component '{component}' " + "can not be combined with adding a pretrained Tok2Vec layer.") @add_codes