mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
Merge pull request #5518 from svlandeg/fix/pretrain-docs
Pretrain fixes
This commit is contained in:
commit
2a8137aba9
|
@ -187,7 +187,7 @@ def evaluate_textcat(tokenizer, textcat, texts, cats):
|
||||||
width=("Width of CNN layers", "positional", None, int),
|
width=("Width of CNN layers", "positional", None, int),
|
||||||
embed_size=("Embedding rows", "positional", None, int),
|
embed_size=("Embedding rows", "positional", None, int),
|
||||||
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
pretrain_iters=("Number of iterations to pretrain", "option", "pn", int),
|
||||||
train_iters=("Number of iterations to pretrain", "option", "tn", int),
|
train_iters=("Number of iterations to train", "option", "tn", int),
|
||||||
train_examples=("Number of labelled examples", "option", "eg", int),
|
train_examples=("Number of labelled examples", "option", "eg", int),
|
||||||
vectors_model=("Name or path to vectors model to learn from"),
|
vectors_model=("Name or path to vectors model to learn from"),
|
||||||
)
|
)
|
||||||
|
|
|
@ -15,6 +15,7 @@ import random
|
||||||
|
|
||||||
from .._ml import create_default_optimizer
|
from .._ml import create_default_optimizer
|
||||||
from ..util import use_gpu as set_gpu
|
from ..util import use_gpu as set_gpu
|
||||||
|
from ..errors import Errors
|
||||||
from ..gold import GoldCorpus
|
from ..gold import GoldCorpus
|
||||||
from ..compat import path2str
|
from ..compat import path2str
|
||||||
from ..lookups import Lookups
|
from ..lookups import Lookups
|
||||||
|
@ -182,6 +183,7 @@ def train(
|
||||||
msg.warn("Unable to activate GPU: {}".format(use_gpu))
|
msg.warn("Unable to activate GPU: {}".format(use_gpu))
|
||||||
msg.text("Using CPU only")
|
msg.text("Using CPU only")
|
||||||
use_gpu = -1
|
use_gpu = -1
|
||||||
|
base_components = []
|
||||||
if base_model:
|
if base_model:
|
||||||
msg.text("Starting with base model '{}'".format(base_model))
|
msg.text("Starting with base model '{}'".format(base_model))
|
||||||
nlp = util.load_model(base_model)
|
nlp = util.load_model(base_model)
|
||||||
|
@ -227,6 +229,7 @@ def train(
|
||||||
exits=1,
|
exits=1,
|
||||||
)
|
)
|
||||||
msg.text("Extending component from base model '{}'".format(pipe))
|
msg.text("Extending component from base model '{}'".format(pipe))
|
||||||
|
base_components.append(pipe)
|
||||||
disabled_pipes = nlp.disable_pipes(
|
disabled_pipes = nlp.disable_pipes(
|
||||||
[p for p in nlp.pipe_names if p not in pipeline]
|
[p for p in nlp.pipe_names if p not in pipeline]
|
||||||
)
|
)
|
||||||
|
@ -299,7 +302,7 @@ def train(
|
||||||
|
|
||||||
# Load in pretrained weights
|
# Load in pretrained weights
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
|
components = _load_pretrained_tok2vec(nlp, init_tok2vec, base_components)
|
||||||
msg.text("Loaded pretrained tok2vec for: {}".format(components))
|
msg.text("Loaded pretrained tok2vec for: {}".format(components))
|
||||||
|
|
||||||
# Verify textcat config
|
# Verify textcat config
|
||||||
|
@ -642,7 +645,7 @@ def _load_vectors(nlp, vectors):
|
||||||
util.load_model(vectors, vocab=nlp.vocab)
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
|
|
||||||
|
|
||||||
def _load_pretrained_tok2vec(nlp, loc):
|
def _load_pretrained_tok2vec(nlp, loc, base_components):
|
||||||
"""Load pretrained weights for the 'token-to-vector' part of the component
|
"""Load pretrained weights for the 'token-to-vector' part of the component
|
||||||
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
|
||||||
"""
|
"""
|
||||||
|
@ -651,6 +654,8 @@ def _load_pretrained_tok2vec(nlp, loc):
|
||||||
loaded = []
|
loaded = []
|
||||||
for name, component in nlp.pipeline:
|
for name, component in nlp.pipeline:
|
||||||
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
|
||||||
|
if name in base_components:
|
||||||
|
raise ValueError(Errors.E200.format(component=name))
|
||||||
component.tok2vec.from_bytes(weights_data)
|
component.tok2vec.from_bytes(weights_data)
|
||||||
loaded.append(name)
|
loaded.append(name)
|
||||||
return loaded
|
return loaded
|
||||||
|
|
|
@ -568,6 +568,8 @@ class Errors(object):
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
|
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
||||||
|
"can not be combined with adding a pretrained Tok2Vec layer.")
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -455,7 +455,7 @@ improvement.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
|
$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
|
||||||
[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
|
[--width] [--conv-depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
|
||||||
[--embed-rows] [--loss-func] [--dropout] [--batch-size] [--max-length]
|
[--embed-rows] [--loss-func] [--dropout] [--batch-size] [--max-length]
|
||||||
[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every]
|
[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save-every]
|
||||||
[--init-tok2vec] [--epoch-start]
|
[--init-tok2vec] [--epoch-start]
|
||||||
|
@ -467,7 +467,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
|
||||||
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
|
| `vectors_model` | positional | Name or path to spaCy model with vectors to learn from. |
|
||||||
| `output_dir` | positional | Directory to write models to on each epoch. |
|
| `output_dir` | positional | Directory to write models to on each epoch. |
|
||||||
| `--width`, `-cw` | option | Width of CNN layers. |
|
| `--width`, `-cw` | option | Width of CNN layers. |
|
||||||
| `--depth`, `-cd` | option | Depth of CNN layers. |
|
| `--conv-depth`, `-cd` | option | Depth of CNN layers. |
|
||||||
| `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag> | option | Window size for CNN layers. |
|
| `--cnn-window`, `-cW` <Tag variant="new">2.2.2</Tag> | option | Window size for CNN layers. |
|
||||||
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag> | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). |
|
| `--cnn-pieces`, `-cP` <Tag variant="new">2.2.2</Tag> | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). |
|
||||||
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
|
| `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag> | flag | Whether to use character-based embedding. |
|
||||||
|
@ -542,7 +542,7 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Argument | Type | Description |
|
| Argument | Type | Description |
|
||||||
| ----------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| -------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
| `lang` | positional | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. |
|
||||||
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
| `output_dir` | positional | Model output directory. Will be created if it doesn't exist. |
|
||||||
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
| `--jsonl-loc`, `-j` | option | Optional location of JSONL-formatted [vocabulary file](/api/annotation#vocab-jsonl) with lexical attributes. |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user