mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge pull request #5518 from svlandeg/fix/pretrain-docs
Pretrain fixes
This commit is contained in:
		
						commit
						2a8137aba9
					
				| 
						 | 
					@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
 | 
				
			||||||
    width=("Width of CNN layers", "positional", None, int),
 | 
					    width=("Width of CNN layers", "positional", None, int),
 | 
				
			||||||
    embed_size=("Embedding rows", "positional", None, int),
 | 
					    embed_size=("Embedding rows", "positional", None, int),
 | 
				
			||||||
    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
 | 
					    pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
 | 
				
			||||||
    train_iters=("Number of iterations to pretrain", "option", "tn", int),
 | 
					    train_iters=("Number of iterations to train", "option", "tn", int),
 | 
				
			||||||
    train_examples=("Number of labelled examples", "option", "eg", int),
 | 
					    train_examples=("Number of labelled examples", "option", "eg", int),
 | 
				
			||||||
    vectors_model=("Name or path to vectors model to learn from"),
 | 
					    vectors_model=("Name or path to vectors model to learn from"),
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -15,6 +15,7 @@ import random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .._ml import create_default_optimizer
 | 
					from .._ml import create_default_optimizer
 | 
				
			||||||
from ..util import use_gpu as set_gpu
 | 
					from ..util import use_gpu as set_gpu
 | 
				
			||||||
 | 
					from ..errors import Errors
 | 
				
			||||||
from ..gold import GoldCorpus
 | 
					from ..gold import GoldCorpus
 | 
				
			||||||
from ..compat import path2str
 | 
					from ..compat import path2str
 | 
				
			||||||
from ..lookups import Lookups
 | 
					from ..lookups import Lookups
 | 
				
			||||||
| 
						 | 
					@ -182,6 +183,7 @@ def train(
 | 
				
			||||||
            msg.warn("Unable to activate GPU: {}".format(use_gpu))
 | 
					            msg.warn("Unable to activate GPU: {}".format(use_gpu))
 | 
				
			||||||
            msg.text("Using CPU only")
 | 
					            msg.text("Using CPU only")
 | 
				
			||||||
            use_gpu = -1
 | 
					            use_gpu = -1
 | 
				
			||||||
 | 
					    base_components = []
 | 
				
			||||||
    if base_model:
 | 
					    if base_model:
 | 
				
			||||||
        msg.text("Starting with base model '{}'".format(base_model))
 | 
					        msg.text("Starting with base model '{}'".format(base_model))
 | 
				
			||||||
        nlp = util.load_model(base_model)
 | 
					        nlp = util.load_model(base_model)
 | 
				
			||||||
| 
						 | 
					@ -227,6 +229,7 @@ def train(
 | 
				
			||||||
                            exits=1,
 | 
					                            exits=1,
 | 
				
			||||||
                        )
 | 
					                        )
 | 
				
			||||||
                msg.text("Extending component from base model '{}'".format(pipe))
 | 
					                msg.text("Extending component from base model '{}'".format(pipe))
 | 
				
			||||||
 | 
					                base_components.append(pipe)
 | 
				
			||||||
        disabled_pipes = nlp.disable_pipes(
 | 
					        disabled_pipes = nlp.disable_pipes(
 | 
				
			||||||
            [p for p in nlp.pipe_names if p not in pipeline]
 | 
					            [p for p in nlp.pipe_names if p not in pipeline]
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
| 
						 | 
					@ -299,7 +302,7 @@ def train(
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Load in pretrained weights
 | 
					    # Load in pretrained weights
 | 
				
			||||||
    if init_tok2vec is not None:
 | 
					    if init_tok2vec is not None:
 | 
				
			||||||
        components = _load_pretrained_tok2vec(nlp, init_tok2vec)
 | 
					        components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
 | 
				
			||||||
        msg.text("Loaded pretrained tok2vec for: {}".format(components))
 | 
					        msg.text("Loaded pretrained tok2vec for: {}".format(components))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Verify textcat config
 | 
					    # Verify textcat config
 | 
				
			||||||
| 
						 | 
					@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors):
 | 
				
			||||||
    util.load_model(vectors, vocab=nlp.vocab)
 | 
					    util.load_model(vectors, vocab=nlp.vocab)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def _load_pretrained_tok2vec(nlp, loc):
 | 
					def _load_pretrained_tok2vec(nlp, loc, base_components):
 | 
				
			||||||
    """Load pretrained weights for the 'token-to-vector' part of the component
 | 
					    """Load pretrained weights for the 'token-to-vector' part of the component
 | 
				
			||||||
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
 | 
					    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
| 
						 | 
					@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc):
 | 
				
			||||||
    loaded = []
 | 
					    loaded = []
 | 
				
			||||||
    for name, component in nlp.pipeline:
 | 
					    for name, component in nlp.pipeline:
 | 
				
			||||||
        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
 | 
					        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
 | 
				
			||||||
 | 
					            if name in base_components:
 | 
				
			||||||
 | 
					                raise ValueError(Errors.E200.format(component=name))
 | 
				
			||||||
            component.tok2vec.from_bytes(weights_data)
 | 
					            component.tok2vec.from_bytes(weights_data)
 | 
				
			||||||
            loaded.append(name)
 | 
					            loaded.append(name)
 | 
				
			||||||
    return loaded
 | 
					    return loaded
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -568,6 +568,8 @@ class Errors(object):
 | 
				
			||||||
    E198 = ("Unable to return {n} most similar vectors for the current vectors "
 | 
					    E198 = ("Unable to return {n} most similar vectors for the current vectors "
 | 
				
			||||||
            "table, which contains {n_rows} vectors.")
 | 
					            "table, which contains {n_rows} vectors.")
 | 
				
			||||||
    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 | 
					    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 | 
				
			||||||
 | 
					    E200 = ("Specifying a base model with a pretrained component '{component}' "
 | 
				
			||||||
 | 
					            "can not be combined with adding a pretrained Tok2Vec layer.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@add_codes
 | 
					@add_codes
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -455,7 +455,7 @@ improvement.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | 
					$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | 
				
			||||||
[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
 | 
					[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
 | 
				
			||||||
[--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length]
 | 
					[--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length]
 | 
				
			||||||
[--min-length]  [--seed] [--n-iter] [--use-vectors] [--n-save-every]
 | 
					[--min-length]  [--seed] [--n-iter] [--use-vectors] [--n-save-every]
 | 
				
			||||||
[--init-tok2vec] [--epoch-start]
 | 
					[--init-tok2vec] [--epoch-start]
 | 
				
			||||||
| 
						 | 
					@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | 
				
			||||||
| `vectors_model`                                       | positional | Name or path to spaCy model with vectors to learn from.                                                                                                                         |
 | 
					| `vectors_model`                                       | positional | Name or path to spaCy model with vectors to learn from.                                                                                                                         |
 | 
				
			||||||
| `output_dir`                                          | positional | Directory to write models to on each epoch.                                                                                                                                     |
 | 
					| `output_dir`                                          | positional | Directory to write models to on each epoch.                                                                                                                                     |
 | 
				
			||||||
| `--width`, `-cw`                                      | option     | Width of CNN layers.                                                                                                                                                            |
 | 
					| `--width`, `-cw`                                      | option     | Width of CNN layers.                                                                                                                                                            |
 | 
				
			||||||
| `--depth`, `-cd`                                      | option     | Depth of CNN layers.                                                                                                                                                            |
 | 
					| `--conv-depth`, `-cd`                                 | option     | Depth of CNN layers.                                                                                                                                                            |
 | 
				
			||||||
| `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag>  | option     | Window size for CNN layers.                                                                                                                                                     |
 | 
					| `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag>  | option     | Window size for CNN layers.                                                                                                                                                     |
 | 
				
			||||||
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag>  | option     | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish).                                                                                             |
 | 
					| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag>  | option     | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish).                                                                                             |
 | 
				
			||||||
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag>  | flag       | Whether to use character-based embedding.                                                                                                                                       |
 | 
					| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag>  | flag       | Whether to use character-based embedding.                                                                                                                                       |
 | 
				
			||||||
| 
						 | 
					@ -541,16 +541,16 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
 | 
				
			||||||
[--prune-vectors]
 | 
					[--prune-vectors]
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Argument                | Type       | Description                                                                                                                                                                                                                                            |
 | 
					| Argument                   | Type       | Description                                                                                                                                                                                                                                            |
 | 
				
			||||||
| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
					| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `lang`                  | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
 | 
					| `lang`                     | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`.                                                                                                                                                           |
 | 
				
			||||||
| `output_dir`            | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
 | 
					| `output_dir`               | positional | Model output directory. Will be created if it doesn't exist.                                                                                                                                                                                           |
 | 
				
			||||||
| `--jsonl-loc`, `-j`     | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes.                                                                                                                                           |
 | 
					| `--jsonl-loc`, `-j`        | option     | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes.                                                                                                                                           |
 | 
				
			||||||
| `--vectors-loc`, `-v`   | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
 | 
					| `--vectors-loc`, `-v`      | option     | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. |
 | 
				
			||||||
| `--truncate-vectors`, `-t` | option  | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
 | 
					| `--truncate-vectors`, `-t` | option     | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation.                                                                                                                                                      |
 | 
				
			||||||
| `--prune-vectors`, `-V` | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
 | 
					| `--prune-vectors`, `-V`    | option     | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning.                                                                                                                                                                         |
 | 
				
			||||||
| `--vectors-name`, `-vn` | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
 | 
					| `--vectors-name`, `-vn`    | option     | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`.                                                                                                                                                                  |
 | 
				
			||||||
| **CREATES**             | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
 | 
					| **CREATES**                | model      | A spaCy model containing the vocab and vectors.                                                                                                                                                                                                        |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Evaluate {#evaluate new="2"}
 | 
					## Evaluate {#evaluate new="2"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user