Merge remote-tracking branch 'upstream/develop' into feature/doc-ents-v3-2

This commit is contained in:
Adriane Boyd 2020-09-21 14:42:04 +02:00
commit 13fbf6556a
62 changed files with 881 additions and 1028 deletions

View File

@ -1,133 +0,0 @@
[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null
[system]
seed = 0
use_pytorch_for_gpu_memory = false
[training]
seed = ${system:seed}
dropout = 0.1
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.4, "ents_f": 0.4, "tag_acc": 0.2}
frozen_components = []
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 1e-8
learn_rate = 0.001
[nlp]
lang = "en"
load_vocab_data = false
pipeline = ["tok2vec", "ner", "tagger", "parser"]
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[components]
[components.tok2vec]
factory = "tok2vec"
[components.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
[components.tagger]
factory = "tagger"
[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 2
use_upper = true
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 2
use_upper = true
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
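
The batch sizes above come from thinc's `compounding.v1` schedule. As a minimal sketch of how that schedule behaves (assuming thinc's generator semantics: the value starts at `start`, is multiplied by `compound` after each draw, and is capped at `stop`):

```python
from thinc.api import compounding

# Mirrors [training.batcher.size] above: grow from 100 toward 1000,
# multiplying by 1.001 after every batch.
sizes = compounding(100.0, 1000.0, 1.001)
print([round(next(sizes), 2) for _ in range(5)])
# [100.0, 100.1, 100.2, 100.3, 100.4] -- creeping toward the 1000 cap
```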

View File

@ -1,152 +0,0 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 400
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"
use_gpu = -1
raw_text = null
tag_map = null
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
n_save_every = null
batch_size = 3000
seed = ${training:seed}
use_pytorch_for_gpu_memory = ${training:use_pytorch_for_gpu_memory}
tok2vec_model = "nlp.pipeline.tok2vec.model"
[pretraining.objective]
type = "characters"
n_characters = 4
[pretraining.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[nlp]
lang = "en"
vectors = null
base_model = null
[nlp.pipeline]
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.senter]
factory = "senter"
[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true
dropout = null

View File

@ -1,73 +0,0 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
max_length = 3000
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 100000
max_epochs = 0
max_steps = 0
eval_frequency = 1000
# Other settings
seed = 0
accumulate_gradient = 1
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
scores = ["speed", "ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1.0}
# These settings are invalid for the transformer models.
init_tok2vec = null
discard_oversize = false
omit_extra_lookups = false
batch_by = "words"
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.ner]
factory = "ner"
learn_tokens = false
min_action_freq = 1
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 64
maxout_pieces = 2
use_upper = true
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
dropout = ${training:dropout}

View File

@ -1,73 +0,0 @@
[training]
patience = 10000
eval_frequency = 200
dropout = 0.2
init_tok2vec = null
vectors = null
max_epochs = 100
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
use_gpu = 0
scores = ["tags_acc", "uas", "las"]
score_weights = {"las": 0.8, "tags_acc": 0.2}
limit = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = ${training:vectors}
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
beam_width = 1
beam_update_prob = 1.0
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedBiLSTM.v1"
pretrained_vectors = ${nlp:vectors}
width = 96
depth = 4
embed_size = 2000
subword_features = true
maxout_pieces = 3
dropout = null

View File

@ -1,110 +0,0 @@
[paths]
train = ""
dev = ""
raw = null
init_tok2vec = null
[system]
seed = 0
use_pytorch_for_gpu_memory = false
[training]
seed = ${system:seed}
dropout = 0.2
init_tok2vec = ${paths:init_tok2vec}
vectors = null
accumulate_gradient = 1
max_steps = 0
max_epochs = 0
patience = 10000
eval_frequency = 200
score_weights = {"dep_las": 0.8, "tag_acc": 0.2}
[training.read_train]
@readers = "spacy.Corpus.v1"
path = ${paths:train}
gold_preproc = true
max_length = 0
limit = 0
[training.read_dev]
@readers = "spacy.Corpus.v1"
path = ${paths:dev}
gold_preproc = ${training.read_train:gold_preproc}
max_length = 0
limit = 0
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger", "parser"]
load_vocab_data = false
[nlp.tokenizer]
@tokenizers = "spacy.Tokenizer.v1"
[nlp.lemmatizer]
@lemmatizers = "spacy.Lemmatizer.v1"
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tagger]
factory = "tagger"
[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 1
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 64
maxout_pieces = 3
[components.parser.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode:width}
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = ${components.tok2vec.model.encode:width}
rows = 2000
also_embed_subwords = true
also_use_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3

View File

@ -1,69 +0,0 @@
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
batch_size = 25
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.Tok2Vec.v1"
[nlp.pipeline.tok2vec.model.extract]
@architectures = "spacy.CharacterEmbed.v1"
width = 96
nM = 64
nC = 8
rows = 2000
columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
dropout = null
[nlp.pipeline.tok2vec.model.extract.features]
@architectures = "spacy.Doc2Feats.v1"
columns = ${nlp.pipeline.tok2vec.model.extract:columns}
[nlp.pipeline.tok2vec.model.embed]
@architectures = "spacy.LayerNormalizedMaxout.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
maxout_pieces = 4
[nlp.pipeline.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}
window_size = 1
maxout_pieces = 2
depth = 2
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model.extract:width}

View File

@ -1,51 +0,0 @@
[training]
use_gpu = -1
limit = 0
dropout = 0.2
patience = 10000
eval_frequency = 200
scores = ["ents_p", "ents_r", "ents_f"]
score_weights = {"ents_f": 1}
orth_variant_level = 0.0
gold_preproc = true
max_length = 0
seed = 0
accumulate_gradient = 2
discard_oversize = false
[training.batch_size]
@schedules = "compounding.v1"
start = 3000
stop = 3000
compound = 1.001
[training.optimizer]
@optimizers = "Adam.v1"
learn_rate = 0.001
beta1 = 0.9
beta2 = 0.999
[nlp]
lang = "en"
vectors = null
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
hidden_width = 64
maxout_pieces = 2
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
width = 128
depth = 4
embed_size = 7000
maxout_pieces = 3
window_size = 1
subword_features = true
pretrained_vectors = null
dropout = null

View File

@ -6,7 +6,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.0a31,<8.0.0a40",
"thinc>=8.0.0a34,<8.0.0a40",
"blis>=0.4.0,<0.5.0",
"pytokenizations",
"pathy"

View File

@ -1,9 +1,9 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a31,<8.0.0a40
thinc>=8.0.0a34,<8.0.0a40
blis>=0.4.0,<0.5.0
ml_datasets>=0.1.1
ml_datasets==0.2.0a0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -34,13 +34,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.0a31,<8.0.0a40
thinc>=8.0.0a34,<8.0.0a40
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0a31,<8.0.0a40
thinc>=8.0.0a34,<8.0.0a40
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0

View File

@ -6,7 +6,6 @@ from wasabi import msg
import srsly
import hashlib
import typer
import subprocess
from click import NoSuchOption
from typer.main import get_command
from contextlib import contextmanager
@ -308,6 +307,31 @@ def git_checkout(
msg.fail("Destination of checkout must not exist", exits=1)
if not dest.parent.exists():
raise IOError("Parent of destination of checkout must exist")
if sparse and git_version >= (2, 22):
return git_sparse_checkout(repo, subpath, dest, branch)
elif sparse:
# Only show warnings if the user explicitly wants sparse checkout but
# the Git version doesn't support it
err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
f"that doesn't fully support sparse checkout yet."
)
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
msg.warn(
f"{err_unk if git_version == (0, 0) else err_old} "
f"This means that more files than necessary may be downloaded "
f"temporarily. To only download the files needed, make sure "
f"you're using Git v2.22 or above."
)
with make_tempdir() as tmp_dir:
cmd = f"git -C {tmp_dir} clone {repo} . -b {branch}"
run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
shutil.copytree(str(tmp_dir / Path(subpath)), str(dest))
def git_sparse_checkout(repo, subpath, dest, branch):
# We're using Git, partial clone and sparse checkout to
# only clone the files we need
# This ends up being RIDICULOUS. omg.
@ -324,47 +348,31 @@ def git_checkout(
# *that* we can do by path.
# We're using Git and sparse checkout to only clone the files we need
with make_tempdir() as tmp_dir:
supports_sparse = git_version >= (2, 22)
use_sparse = supports_sparse and sparse
# This is the "clone, but don't download anything" part.
cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
if use_sparse:
cmd += f"--filter=blob:none" # <-- The key bit
# Only show warnings if the user explicitly wants sparse checkout but
# the Git version doesn't support it
elif sparse:
err_old = (
f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
f"that doesn't fully support sparse checkout yet."
)
err_unk = "You're running an unknown version of Git, so sparse checkout has been disabled."
msg.warn(
f"{err_unk if git_version == (0, 0) else err_old} "
f"This means that more files than necessary may be downloaded "
f"temporarily. To only download the files needed, make sure "
f"you're using Git v2.22 or above."
)
try_run_command(cmd)
cmd = (
f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
f"-b {branch} --filter=blob:none"
)
run_command(cmd)
# Now we need to find the missing filenames for the subpath we want.
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if use_sparse else ''} -- {subpath}"
ret = try_run_command(cmd)
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
ret = run_command(cmd, capture=True)
git_repo = _from_http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
if use_sparse and not missings:
if not missings:
err = (
f"Could not find any relevant files for '{subpath}'. "
f"Did you specify a correct and complete path within repo '{repo}' "
f"and branch {branch}?"
)
msg.fail(err, exits=1)
if use_sparse:
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
try_run_command(cmd)
cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
run_command(cmd, capture=True)
# And finally, we can checkout our subpath
cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
try_run_command(cmd)
run_command(cmd, capture=True)
# We need Path(name) to make sure we also support subdirectories
shutil.move(str(tmp_dir / Path(subpath)), str(dest))
@ -378,7 +386,7 @@ def get_git_version(
RETURNS (Tuple[int, int]): The version as a (major, minor) tuple. Returns
(0, 0) if the version couldn't be determined.
"""
ret = try_run_command(["git", "--version"], error=error)
ret = run_command("git --version", capture=True)
stdout = ret.stdout.strip()
if not stdout or not stdout.startswith("git version"):
return (0, 0)
@ -386,23 +394,6 @@ def get_git_version(
return (int(version[0]), int(version[1]))
def try_run_command(
cmd: Union[str, List[str]], error: str = "Could not run command"
) -> subprocess.CompletedProcess:
"""Try running a command and raise an error if it fails.
cmd (Union[str, List[str]]): The command to run.
error (str): The error message.
RETURNS (CompletedProcess): The completed process if the command ran.
"""
try:
return run_command(cmd, capture=True)
except subprocess.CalledProcessError as e:
msg.fail(error)
print(cmd)
sys.exit(1)
def _from_http_to_git(repo: str) -> str:
if repo.startswith("http://"):
repo = repo.replace(r"http://", r"https://")
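
For context on the `git_version >= (2, 22)` gate above, here is a small standalone sketch of the check `get_git_version` performs, assuming the standard `git version X.Y.Z` output format:

```python
import subprocess

# Parse "git version 2.30.1" into a (major, minor) tuple; fall back to
# (0, 0) when the version can't be determined.
ret = subprocess.run(["git", "--version"], capture_output=True, encoding="utf8")
stdout = ret.stdout.strip()
version = (0, 0)
if stdout.startswith("git version"):
    major, minor = stdout.split()[2].split(".")[:2]
    version = (int(major), int(minor))
print("sparse checkout supported:", version >= (2, 22))
```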

View File

@ -2,7 +2,7 @@ from typing import Dict, Any, Optional
from pathlib import Path
from wasabi import msg
from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
from thinc.api import Model, data_validation
from thinc.api import Model, data_validation, set_gpu_allocator
import typer
from ._util import Arg, Opt, debug_cli, show_validation_error
@ -53,7 +53,12 @@ def debug_model_cli(
}
config_overrides = parse_config_overrides(ctx.args)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=config_overrides)
config = util.load_config(
config_path, overrides=config_overrides, interpolate=True
)
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
nlp, config = util.load_model_from_config(config_path)
seed = config["training"]["seed"]
if seed is not None:
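
The same `gpu_allocator` handling recurs in `pretrain` and `train` below. A minimal sketch of what it amounts to at runtime, assuming a CUDA-capable machine:

```python
from thinc.api import require_gpu, set_gpu_allocator

# When PyTorch-backed components run on GPU, route cupy's memory requests
# through PyTorch's allocator so the two frameworks share one memory pool.
require_gpu(0)                # raises if no GPU is available
set_gpu_allocator("pytorch")  # value read from the config's gpu_allocator
```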

View File

@ -30,6 +30,7 @@ def init_config_cli(
pipeline: Optional[str] = Opt("tagger,parser,ner", "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(Optimizations.efficiency.value, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
cpu: bool = Opt(False, "--cpu", "-C", help="Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
# fmt: on
):
"""
@ -43,7 +44,14 @@ def init_config_cli(
if isinstance(optimize, Optimizations): # instance of enum from the CLI
optimize = optimize.value
pipeline = string_to_list(pipeline)
init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
init_config(
output_file,
lang=lang,
pipeline=pipeline,
optimize=optimize,
cpu=cpu,
pretraining=pretraining,
)
@init_cli.command("fill-config")
@ -51,7 +59,7 @@ def init_fill_config_cli(
# fmt: off
base_path: Path = Arg(..., help="Base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="File to save config.cfg to (or - for stdout)", allow_dash=True),
pretraining: bool = Opt(False, "--pretraining", "-p", help="Include config for pretraining (with 'spacy pretrain')"),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes")
# fmt: on
):
@ -109,7 +117,13 @@ def fill_config(
def init_config(
output_file: Path, *, lang: str, pipeline: List[str], optimize: str, cpu: bool
output_file: Path,
*,
lang: str,
pipeline: List[str],
optimize: str,
cpu: bool,
pretraining: bool = False,
) -> None:
is_stdout = str(output_file) == "-"
msg = Printer(no_print=is_stdout)
@ -156,8 +170,13 @@ def init_config(
with show_validation_error(hint_fill=False):
config = util.load_config_from_str(base_template)
nlp, _ = util.load_model_from_config(config, auto_fill=True)
config = nlp.config
if pretraining:
validate_config_for_pretrain(config, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
config = pretrain_config.merge(config)
msg.good("Auto-filled config with all values")
save_config(nlp.config, output_file, is_stdout=is_stdout)
save_config(config, output_file, is_stdout=is_stdout)
def save_config(

View File

@ -110,7 +110,7 @@ def package(
msg.good(f"Successfully created package '{model_name_v}'", main_path)
if create_sdist:
with util.working_dir(main_path):
util.run_command([sys.executable, "setup.py", "sdist"])
util.run_command([sys.executable, "setup.py", "sdist"], capture=False)
zip_file = main_path / "dist" / f"{model_name_v}.tar.gz"
msg.good(f"Successfully created zipped Python package", zip_file)

View File

@ -4,10 +4,9 @@ import time
import re
from collections import Counter
from pathlib import Path
from thinc.api import Config
from thinc.api import use_pytorch_for_gpu_memory, require_gpu
from thinc.api import require_gpu, set_gpu_allocator
from thinc.api import set_dropout_rate, to_categorical, fix_random_seed
from thinc.api import CosineDistance, L2Distance
from thinc.api import Config, CosineDistance, L2Distance
from wasabi import msg
import srsly
from functools import partial
@ -20,6 +19,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..tokens import Doc
from ..attrs import ID
from .. import util
from ..util import dot_to_object
@app.command(
@ -31,7 +31,7 @@ def pretrain_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
epoch_resume: Optional[int] = Opt(None, "--epoch-resume", "-er", help="The epoch to resume counting from when using --resume-path. Prevents unintended overwriting of existing weight files."),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
@ -70,9 +70,7 @@ def pretrain_cli(
with show_validation_error(config_path):
config = util.load_config(
config_path,
overrides=config_overrides,
interpolate=True
config_path, overrides=config_overrides, interpolate=True
)
if not config.get("pretraining"):
# TODO: What's the solution here? How do we handle optional blocks?
@ -83,7 +81,7 @@ def pretrain_cli(
config.to_disk(output_dir / "config.cfg")
msg.good("Saved config file in the output directory")
pretrain(
config,
output_dir,
@ -98,15 +96,17 @@ def pretrain(
output_dir: Path,
resume_path: Optional[Path] = None,
epoch_resume: Optional[int] = None,
use_gpu: int=-1
use_gpu: int = -1,
):
if config["system"].get("seed") is not None:
fix_random_seed(config["system"]["seed"])
if use_gpu >= 0 and config["system"].get("use_pytorch_for_gpu_memory"):
use_pytorch_for_gpu_memory()
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
nlp, config = util.load_model_from_config(config)
P_cfg = config["pretraining"]
corpus = P_cfg["corpus"]
corpus = dot_to_object(config, P_cfg["corpus"])
batcher = P_cfg["batcher"]
model = create_pretraining_model(nlp, config["pretraining"])
optimizer = config["pretraining"]["optimizer"]
@ -147,9 +147,7 @@ def pretrain(
progress = tracker.update(epoch, loss, docs)
if progress:
msg.row(progress, **row_settings)
if P_cfg["n_save_every"] and (
batch_id % P_cfg["n_save_every"] == 0
):
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
_save_model(epoch, is_temp=True)
_save_model(epoch)
tracker.epoch_loss = 0.0

View File

@ -59,8 +59,9 @@ def project_run(
for dep in cmd.get("deps", []):
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command?"
err_kwargs = {"exits": 1} if not dry else {}
msg.fail(err, **err_kwargs)
msg.fail(err, err_help, **err_kwargs)
with working_dir(project_dir) as current_dir:
rerun = check_rerun(current_dir, cmd)
if not rerun and not force:
@ -144,7 +145,7 @@ def run_commands(
if not silent:
print(f"Running command: {join_command(command)}")
if not dry:
run_command(command)
run_command(command, capture=False)
def validate_subcommand(

View File

@ -8,7 +8,11 @@ train = ""
dev = ""
[system]
use_pytorch_for_gpu_memory = {{ "true" if use_transformer else "false" }}
{% if use_transformer -%}
gpu_allocator = "pytorch"
{% else -%}
gpu_allocator = null
{% endif %}
[nlp]
lang = "{{ lang }}"
@ -173,6 +177,18 @@ factory = "{{ pipe }}"
{% endif %}
{% endfor %}
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 2000 }}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
[training]
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
vectors = null
@ -182,11 +198,12 @@ vectors = "{{ word_vectors }}"
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
{% endif %}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
[training.optimizer]
@optimizers = "Adam.v1"
{% if use_transformer -%}
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
@ -195,16 +212,6 @@ total_steps = 20000
initial_rate = 5e-5
{% endif %}
[training.train_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 2000 }}
[training.dev_corpus]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
{% if use_transformer %}
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"

View File

@ -6,8 +6,7 @@ from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
from thinc.api import use_pytorch_for_gpu_memory, require_gpu, fix_random_seed
from thinc.api import Config, Optimizer
from thinc.api import Config, Optimizer, require_gpu, fix_random_seed, set_gpu_allocator
import random
import typer
import logging
@ -18,6 +17,7 @@ from ..language import Language
from .. import util
from ..training.example import Example
from ..errors import Errors
from ..util import dot_to_object
@app.command(
@ -28,7 +28,7 @@ def train_cli(
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
output_path: Optional[Path] = Opt(None, "--output", "--output-path", "-o", help="Output directory to store trained pipeline in"),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
resume: bool = Opt(False, "--resume", "-R", help="Resume training"),
@ -78,22 +78,23 @@ def train(
config = util.load_config(
config_path, overrides=config_overrides, interpolate=True
)
if config.get("training", {}).get("seed") is not None:
if config["training"]["seed"] is not None:
fix_random_seed(config["training"]["seed"])
if config.get("system", {}).get("use_pytorch_for_gpu_memory"):
# It feels kind of weird to not have a default for this.
use_pytorch_for_gpu_memory()
allocator = config["training"]["gpu_allocator"]
if use_gpu >= 0 and allocator:
set_gpu_allocator(allocator)
# Use original config here before it's resolved to functions
sourced_components = get_sourced_components(config)
with show_validation_error(config_path):
nlp, config = util.load_model_from_config(config)
util.load_vocab_data_into_model(nlp, lookups=config["training"]["lookups"])
if config["training"]["vectors"] is not None:
util.load_vectors_into_model(nlp, config["training"]["vectors"])
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
T_cfg = config["training"]
optimizer = T_cfg["optimizer"]
train_corpus = T_cfg["train_corpus"]
dev_corpus = T_cfg["dev_corpus"]
train_corpus = dot_to_object(config, T_cfg["train_corpus"])
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
batcher = T_cfg["batcher"]
train_logger = T_cfg["logger"]
# Components that shouldn't be updated during training
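
`dot_to_object` is what turns the new string-valued `train_corpus` and `dev_corpus` settings into actual reader functions. A minimal sketch of the lookup, assuming it accepts any nested mapping:

```python
from spacy.util import dot_to_object

config = {
    "training": {"train_corpus": "corpora.train"},
    "corpora": {"train": lambda nlp: []},  # stand-in for a registered reader
}
# Walk the dotted path "corpora.train" down through the config:
reader = dot_to_object(config, config["training"]["train_corpus"])
assert callable(reader)
```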

View File

@ -6,13 +6,12 @@ init_tok2vec = null
[system]
seed = 0
use_pytorch_for_gpu_memory = false
gpu_allocator = null
[nlp]
lang = null
pipeline = []
disabled = []
load_vocab_data = true
before_creation = null
after_creation = null
after_pipeline_creation = null
@ -22,29 +21,10 @@ after_pipeline_creation = null
[components]
# Training hyper-parameters and additional features.
[training]
seed = ${system.seed}
dropout = 0.1
accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
score_weights = {}
# Names of pipeline components that shouldn't be updated during training
frozen_components = []
# Readers for corpora like dev and train.
[corpora]
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
[training.train_corpus]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
# Whether to train on sequences with 'gold standard' sentence boundaries
@ -56,7 +36,7 @@ max_length = 0
# Limitation on number of training examples
limit = 0
[training.dev_corpus]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
# Whether to train on sequences with 'gold standard' sentence boundaries
@ -68,6 +48,34 @@ max_length = 0
# Limitation on number of training examples
limit = 0
# Training hyper-parameters and additional features.
[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
# Extra resources for transfer-learning or pseudo-rehearsal
init_tok2vec = ${paths.init_tok2vec}
raw_text = ${paths.raw}
vectors = null
lookups = null
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
# Control how scores are printed and checkpoints are evaluated.
score_weights = {}
# Names of pipeline components that shouldn't be updated during training
frozen_components = []
# Location in the config where the dev corpus is defined
dev_corpus = "corpora.dev"
# Location in the config where the train corpus is defined
train_corpus = "corpora.train"
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false

View File

@ -4,6 +4,7 @@ dropout = 0.2
n_save_every = null
component = "tok2vec"
layer = ""
corpus = "corpora.pretrain"
[pretraining.batcher]
@batchers = "spacy.batch_by_words.v1"
@ -12,13 +13,6 @@ discard_oversize = false
tolerance = 0.2
get_length = null
[pretraining.corpus]
@readers = "spacy.JsonlReader.v1"
path = ${paths.raw}
min_length = 5
max_length = 500
limit = 0
[pretraining.objective]
type = "characters"
n_characters = 4
@ -33,3 +27,12 @@ grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
[corpora]
[corpora.pretrain]
@readers = "spacy.JsonlReader.v1"
path = ${paths.raw}
min_length = 5
max_length = 500
limit = 0

View File

@ -119,9 +119,6 @@ class Warnings:
W105 = ("As of spaCy v3.0, the {matcher}.pipe method is deprecated. If you "
"need to match on a stream of documents, you can use nlp.pipe and "
"call the {matcher} on each Doc object.")
W106 = ("Both HEAD and SENT_START are included as attributes in "
"doc.from_array(). The parse trees based on the HEAD attribute "
"will override the values in SENT_START.")
W107 = ("The property Doc.{prop} is deprecated. Use "
"Doc.has_annotation(\"{attr}\") instead.")

View File

@ -31,6 +31,7 @@ from .schemas import ConfigSchema
from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
# This is the base config with all settings (training etc.)
@ -86,6 +87,13 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
return tokenizer_factory
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
lookups = load_lookups(lang=lang, tables=tables)
return lookups
class Language:
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
@ -148,12 +156,7 @@ class Language:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
if vocab is True:
vectors_name = meta.get("vectors", {}).get("name")
vocab = create_vocab(
self.lang,
self.Defaults,
vectors_name=vectors_name,
load_data=self._config["nlp"]["load_vocab_data"],
)
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -1455,7 +1458,7 @@ class Language:
# here :(
for i, (name1, proc1) in enumerate(self.pipeline):
if hasattr(proc1, "find_listeners"):
for name2, proc2 in self.pipeline[i+1:]:
for name2, proc2 in self.pipeline[i + 1 :]:
if isinstance(getattr(proc2, "model", None), Model):
proc1.find_listeners(proc2.model)
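
The `spacy.LookupsDataLoader.v1` function registered above is intended to be referenced from a config block. As a sketch, it can also be resolved and called directly (this assumes the optional `spacy-lookups-data` package is installed):

```python
from spacy.util import registry

# Fetch the registered loader and build a Lookups object for English.
loader = registry.misc.get("spacy.LookupsDataLoader.v1")
lookups = loader(lang="en", tables=["lexeme_norm"])
print(lookups.has_table("lexeme_norm"))
```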

View File

@ -164,7 +164,9 @@ def MultiHashEmbed(
@registry.architectures.register("spacy.CharacterEmbed.v1")
def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool):
def CharacterEmbed(
width: int, rows: int, nM: int, nC: int, also_use_static_vectors: bool
):
"""Construct an embedded representation based on character embeddings, using
a feed-forward network. A fixed number of UTF-8 byte characters are used for
each word, taken from the beginning and end of the word equally. Padding is
@ -202,9 +204,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
),
StaticVectors(width, dropout=0.0),
),
with_array(Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)),
with_array(
Maxout(width, nM * nC + (2 * width), nP=3, normalize=True, dropout=0.0)
),
ragged2list(),
)
)
else:
model = chain(
concatenate(
@ -215,9 +219,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int, also_use_static_vect
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
),
),
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
with_array(
Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)
),
ragged2list(),
)
)
return model

View File

@ -181,9 +181,9 @@ class TextCategorizer(Pipe):
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
"""
tensors = [doc.tensor for doc in docs]
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
tensors = [doc.tensor for doc in docs]
xp = get_array_module(tensors)
scores = xp.zeros((len(docs), len(self.labels)))
return scores

View File

@ -1,4 +1,4 @@
from typing import Dict, List, Union, Optional, Sequence, Any, Callable, Type, Tuple
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator
@ -8,6 +8,7 @@ from collections import defaultdict
from thinc.api import Optimizer
from .attrs import NAMES
from .lookups import Lookups
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
@ -104,7 +105,7 @@ class TokenPatternOperator(str, Enum):
StringValue = Union[TokenPatternString, StrictStr]
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
UnderscoreValue = Union[
TokenPatternString, TokenPatternNumber, str, int, float, list, bool,
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
]
@ -198,8 +199,9 @@ class ModelMetaSchema(BaseModel):
class ConfigSchemaTraining(BaseModel):
# fmt: off
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
train_corpus: Reader = Field(..., title="Reader for the training data")
dev_corpus: Reader = Field(..., title="Reader for the dev data")
lookups: Optional[Lookups] = Field(..., title="Vocab lookups")
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
dropout: StrictFloat = Field(..., title="Dropout rate")
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
@ -207,6 +209,7 @@ class ConfigSchemaTraining(BaseModel):
max_steps: StrictInt = Field(..., title="Maximum number of update steps to train for")
eval_frequency: StrictInt = Field(..., title="How often to evaluate during training (steps)")
seed: Optional[StrictInt] = Field(..., title="Random seed")
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
@ -227,7 +230,6 @@ class ConfigSchemaNlp(BaseModel):
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use")
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
@ -249,11 +251,11 @@ class ConfigSchemaPretrain(BaseModel):
dropout: StrictFloat = Field(..., title="Dropout rate")
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
optimizer: Optimizer = Field(..., title="The optimizer to use")
corpus: Reader = Field(..., title="Reader for the training data")
corpus: StrictStr = Field(..., title="Path in the config to the training data")
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
# fmt: on
@ -268,6 +270,7 @@ class ConfigSchema(BaseModel):
nlp: ConfigSchemaNlp
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
@root_validator(allow_reuse=True)
def validate_config(cls, values):
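
A small pydantic sketch of the type change above: corpus references are now validated as dot-path strings into `[corpora]` rather than as resolved `Reader` objects. The model below is hypothetical, for illustration only:

```python
from pydantic import BaseModel, StrictStr

class TrainingRefsSketch(BaseModel):
    # StrictStr rejects non-string values instead of coercing them.
    train_corpus: StrictStr
    dev_corpus: StrictStr

refs = TrainingRefsSketch(train_corpus="corpora.train", dev_corpus="corpora.dev")
print(refs.train_corpus)  # resolved later via dot_to_object
```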

View File

@ -270,6 +270,18 @@ class Scorer:
for example in examples:
pred_doc = example.predicted
gold_doc = example.reference
# TODO
# This is a temporary hack to work around the problem that the scorer
# fails if you have examples that are not fully annotated for all
# the tasks in your pipeline. For instance, you might have a corpus
# of NER annotations that does not set sentence boundaries, but the
# pipeline includes a parser or senter, and then the score_weights
# are used to evaluate that component. When the scorer attempts
# to read the sentences from the gold document, it fails.
try:
list(getter(gold_doc, attr))
except ValueError:
continue
# Find all labels in gold and doc
labels = set(
[k.label_ for k in getter(gold_doc, attr)]

View File

@ -274,12 +274,11 @@ def test_doc_from_array_sent_starts(en_vocab):
# fmt: on
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
# HEAD overrides SENT_START with warning
# HEAD overrides SENT_START without warning
attrs = [SENT_START, HEAD]
arr = doc.to_array(attrs)
new_doc = Doc(en_vocab, words=words)
with pytest.warns(UserWarning):
new_doc.from_array(attrs, arr)
new_doc.from_array(attrs, arr)
# no warning using default attrs
attrs = doc._get_array_attrs()

View File

@ -92,7 +92,12 @@ def test_spans_span_sent(doc, doc_not_parsed):
def test_spans_lca_matrix(en_tokenizer):
"""Test span's lca matrix generation"""
tokens = en_tokenizer("the lazy dog slept")
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0], deps=["dep"] * 4)
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=[2, 1, 1, 0],
deps=["dep"] * 4,
)
lca = doc[:2].get_lca_matrix()
assert lca.shape == (2, 2)
assert lca[0, 0] == 0 # the & the -> the

View File

@ -63,7 +63,12 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=["dep"] * len(heads))
doc = get_doc(
tokens.vocab,
words=[t.text for t in tokens],
heads=heads,
deps=["dep"] * len(heads),
)
lefts = {}
rights = {}

View File

@ -345,10 +345,7 @@ def test_language_factories_invalid():
[{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
{"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
),
(
[{"a": 0.5, "b": 0.5}, {"b": 1.0}],
{"a": 0.25, "b": 0.75},
),
([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75},),
],
)
def test_language_factories_combine_score_weights(weights, expected):
@ -363,16 +360,10 @@ def test_language_factories_scores():
weights1 = {"a1": 0.5, "a2": 0.5}
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
Language.factory(
f"{name}1",
scores=list(weights1),
default_score_weights=weights1,
func=func,
f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
)
Language.factory(
f"{name}2",
scores=list(weights2),
default_score_weights=weights2,
func=func,
f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
)
meta1 = Language.get_factory_meta(f"{name}1")
assert meta1.default_score_weights == weights1

View File

@ -9,7 +9,7 @@ from spacy.tokens import Doc
from spacy.training import Example
from spacy import util
from spacy.lang.en import English
from .util import get_batch
from ..util import get_batch
from thinc.api import Config

View File

@ -212,9 +212,17 @@ def test_issue1834():
heads=[0, -1, -2, -3, -4, -5, 0, -1, -2],
deps=["dep"] * len(words),
)
print(doc.has_annotation("DEP"), [t.head.i for t in doc], [t.is_sent_start for t in doc])
print(
doc.has_annotation("DEP"),
[t.head.i for t in doc],
[t.is_sent_start for t in doc],
)
new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
print(new_doc.has_annotation("DEP"), [t.head.i for t in new_doc], [t.is_sent_start for t in new_doc])
print(
new_doc.has_annotation("DEP"),
[t.head.i for t in new_doc],
[t.is_sent_start for t in new_doc],
)
assert new_doc[6].sent_start
assert new_doc.has_annotation("DEP")
assert new_doc.has_annotation("TAG")

View File

@ -17,16 +17,18 @@ nlp_config_string = """
train = ""
dev = ""
[training]
[corpora]
[training.train_corpus]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[training.dev_corpus]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[training]
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666
@ -300,20 +302,20 @@ def test_config_overrides():
def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
assert config["corpora"]["train"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["training"]["train_corpus"]["path"] == ""
assert interpolated["corpora"]["train"]["path"] == ""
nlp = English.from_config(config)
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
width = "${components.tok2vec.model.width}"
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
assert interpolated2["training"]["train_corpus"]["path"] == ""
assert interpolated2["corpora"]["train"]["path"] == ""
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated)
assert nlp2.config["training"]["train_corpus"]["path"] == ""
assert nlp2.config["corpora"]["train"]["path"] == ""
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
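
The interpolation behavior this test exercises, as a minimal standalone sketch using thinc's `Config`:

```python
from thinc.api import Config

cfg_str = """
[paths]
train = "corpus/train.spacy"

[corpora]

[corpora.train]
path = ${paths.train}
"""
config = Config().from_str(cfg_str, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"  # variable kept
resolved = config.interpolate()
assert resolved["corpora"]["train"]["path"] == "corpus/train.spacy"
```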

View File

@ -136,7 +136,13 @@ def test_serialize_textcat_empty(en_vocab):
# See issue #1105
cfg = {"model": DEFAULT_TEXTCAT_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"]
textcat = TextCategorizer(en_vocab, model, labels=["ENTITY", "ACTION", "MODIFIER"], threshold=0.5, positive_label=None)
textcat = TextCategorizer(
en_vocab,
model,
labels=["ENTITY", "ACTION", "MODIFIER"],
threshold=0.5,
positive_label=None,
)
textcat.to_bytes(exclude=["vocab"])

View File

@ -3,7 +3,6 @@ from click import NoSuchOption
from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides

View File

@ -291,8 +291,7 @@ def test_spacy_blank():
@pytest.mark.parametrize(
"value",
[False, None, ["x", "y"], Language, Vocab],
"value", [False, None, ["x", "y"], Language, Vocab],
)
def test_language_init_invalid_vocab(value):
err_fragment = "invalid value"

View File

@ -69,7 +69,6 @@ def test_util_dot_section():
[nlp]
lang = "en"
pipeline = ["textcat"]
load_vocab_data = false
[components]
@ -95,15 +94,13 @@ def test_util_dot_section():
# not exclusive_classes
assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] is False
# Test that default values got overwritten
assert not en_config["nlp"]["load_vocab_data"]
assert nl_config["nlp"]["load_vocab_data"] # default value True
assert en_config["nlp"]["pipeline"] == ["textcat"]
assert nl_config["nlp"]["pipeline"] == [] # default value []
# Test proper functioning of 'dot_to_object'
with pytest.raises(KeyError):
dot_to_object(en_config, "nlp.pipeline.tagger")
with pytest.raises(KeyError):
dot_to_object(en_config, "nlp.unknownattribute")
assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)

View File

@ -0,0 +1,111 @@
from typing import Dict, Iterable, Callable
import pytest
from thinc.api import Config
from spacy import Language
from spacy.util import load_model_from_config, registry, dot_to_object
from spacy.training import Example
def test_readers():
config_string = """
[training]
[corpora]
@readers = "myreader.v1"
[nlp]
lang = "en"
pipeline = ["tok2vec", "textcat"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.textcat]
factory = "textcat"
"""
@registry.readers.register("myreader.v1")
def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
def reader(nlp: Language):
doc = nlp.make_doc(f"This is an example")
return [Example.from_dict(doc, annots)]
return {"train": reader, "dev": reader, "extra": reader, "something": reader}
config = Config().from_str(config_string)
nlp, resolved = load_model_from_config(config, auto_fill=True)
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
assert isinstance(train_corpus, Callable)
optimizer = resolved["training"]["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
nlp.update([example], sgd=optimizer)
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
scores = nlp.evaluate(list(dev_corpus(nlp)))
assert scores["cats_score"]
# ensure the pipeline runs
doc = nlp("Quick test")
assert doc.cats
extra_corpus = resolved["corpora"]["extra"]
assert isinstance(extra_corpus, Callable)
@pytest.mark.slow
@pytest.mark.parametrize(
"reader,additional_config",
[
("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
],
)
def test_cat_readers(reader, additional_config):
nlp_config_string = """
[training]
[corpora]
@readers = "PLACEHOLDER"
[nlp]
lang = "en"
pipeline = ["tok2vec", "textcat"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.textcat]
factory = "textcat"
"""
config = Config().from_str(nlp_config_string)
config["corpora"]["@readers"] = reader
config["corpora"].update(additional_config)
nlp, resolved = load_model_from_config(config, auto_fill=True)
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
optimizer = resolved["training"]["optimizer"]
# simulate a training loop
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
for example in train_corpus(nlp):
assert example.y.cats
# this shouldn't fail if each training example has at least one positive label
assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
nlp.update([example], sgd=optimizer)
# simulate performance benchmark on dev corpus
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
dev_examples = list(dev_corpus(nlp))
for example in dev_examples:
# this shouldn't fail if each dev example has at least one positive label
assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
scores = nlp.evaluate(dev_examples)
assert scores["cats_score"]
# ensure the pipeline runs
doc = nlp("Quick test")
assert doc.cats

View File

@ -12,7 +12,7 @@ from thinc.api import compounding
import pytest
import srsly
from .util import make_tempdir, get_doc
from ..util import make_tempdir, get_doc
@pytest.fixture
@ -34,7 +34,17 @@ def doc():
# fmt: on
nlp = English()
words = [t.text for t in nlp.make_doc(text)]
doc = get_doc(nlp.vocab, words=words, tags=tags, pos=pos, morphs=morphs, heads=heads, deps=deps, lemmas=lemmas, ents=ents)
doc = get_doc(
nlp.vocab,
words=words,
tags=tags,
pos=pos,
morphs=morphs,
heads=heads,
deps=deps,
lemmas=lemmas,
ents=ents,
)
doc.cats = cats
return doc

View File

@ -830,8 +830,6 @@ cdef class Doc:
if array.dtype != numpy.uint64:
warnings.warn(Warnings.W028.format(type=array.dtype))
if set(attrs) != set(Doc._get_array_attrs()) and SENT_START in attrs and HEAD in attrs:
warnings.warn(Warnings.W106)
cdef int i, col
cdef int32_t abs_head_index
cdef attr_id_t attr_id

View File

@ -253,6 +253,14 @@ def load_vectors_into_model(
nlp.vocab.strings.add(vectors_nlp.vocab.strings[key])
def load_vocab_data_into_model(
nlp: "Language", *, lookups: Optional["Lookups"] = None
) -> None:
"""Load vocab data."""
if lookups:
nlp.vocab.lookups = lookups
def load_model(
name: Union[str, Path],
*,
@ -651,8 +659,8 @@ def join_command(command: List[str]) -> str:
def run_command(
command: Union[str, List[str]],
*,
capture: bool = False,
stdin: Optional[Any] = None,
capture: bool = False,
) -> Optional[subprocess.CompletedProcess]:
"""Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed.
@ -660,33 +668,46 @@ def run_command(
command (str / List[str]): The command. If provided as a string, the
string will be split using shlex.split.
stdin (Optional[Any]): stdin to read from or None.
capture (bool): Whether to capture the output.
capture (bool): Whether to capture the output and errors. If False,
the stdout and stderr will not be redirected, and if there's an error,
sys.exit will be called with the returncode. You should use capture=False
when you want to turn over execution to the command, and capture=True
when you want to run the command more like a function.
RETURNS (Optional[CompletedProcess]): The process object.
"""
if isinstance(command, str):
command = split_command(command)
cmd_list = split_command(command)
cmd_str = command
else:
cmd_list = command
cmd_str = " ".join(command)
try:
ret = subprocess.run(
command,
cmd_list,
env=os.environ.copy(),
input=stdin,
encoding="utf8",
check=True,
check=False,
stdout=subprocess.PIPE if capture else None,
stderr=subprocess.PIPE if capture else None,
stderr=subprocess.STDOUT if capture else None,
)
except FileNotFoundError:
# Indicates the *command* wasn't found, it's an error before the command
# is run.
raise FileNotFoundError(
Errors.E970.format(str_command=" ".join(command), tool=command[0])
Errors.E970.format(str_command=cmd_str, tool=cmd_list[0])
) from None
except subprocess.CalledProcessError as e:
# We don't want a duplicate traceback here so we're making sure the
# CalledProcessError isn't re-raised. We also print both the string
# message and the stderr, in case the error only has one of them.
print(e.stderr)
print(e)
sys.exit(1)
if ret.returncode != 0:
if ret.returncode != 0 and capture:
message = f"Error running command:\n\n{cmd_str}\n\n"
message += f"Subprocess exited with status {ret.returncode}"
if ret.stdout is not None:
message += f"\n\nProcess log (stdout and stderr):\n\n"
message += ret.stdout
error = subprocess.SubprocessError(message)
error.ret = ret
error.command = cmd_str
raise error
elif ret.returncode != 0:
sys.exit(ret.returncode)
return ret
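
The docstring above distinguishes the two modes; here is a minimal usage sketch, assuming the `run_command` helper from `spacy.util` as defined in this diff (the commands themselves are placeholders):

```python
from spacy.util import run_command

# capture=False: turn execution over to the command; stdout/stderr are not
# redirected, and a non-zero exit code ends the process via sys.exit.
run_command("python -m spacy info")

# capture=True: run the command more like a function; output is captured and
# a non-zero exit code raises subprocess.SubprocessError with the process log.
result = run_command(["python", "--version"], capture=True)
print(result.stdout)
```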

View File

@ -28,7 +28,7 @@ cdef class Vocab:
cpdef readonly StringStore strings
cpdef public Morphology morphology
cpdef public object vectors
cpdef public object lookups
cpdef public object _lookups
cpdef public object writing_system
cpdef public object get_noun_chunks
cdef readonly int length

View File

@ -22,14 +22,9 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None, load_data=True):
def create_vocab(lang, defaults, vectors_name=None):
# If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available
if load_data:
tables = ["lexeme_norm", "lexeme_prob", "lexeme_cluster", "lexeme_settings"]
lookups = load_lookups(lang, tables=tables, strict=False)
else:
lookups = Lookups()
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
# This is messy, but it's the minimal working fix to Issue #639.
lex_attrs[IS_STOP] = functools.partial(is_stop, stops=defaults.stop_words)
@ -38,11 +33,9 @@ def create_vocab(lang, defaults, vectors_name=None, load_data=True):
lex_attrs[NORM] = util.add_lookups(
lex_attrs.get(NORM, LEX_ATTRS[NORM]),
BASE_NORMS,
lookups.get_table("lexeme_norm", {}),
)
return Vocab(
lex_attr_getters=lex_attrs,
lookups=lookups,
writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
@ -424,6 +417,19 @@ cdef class Vocab:
orth = self.strings.add(orth)
return orth in self.vectors
property lookups:
def __get__(self):
return self._lookups
def __set__(self, lookups):
self._lookups = lookups
if lookups.has_table("lexeme_norm"):
self.lex_attr_getters[NORM] = util.add_lookups(
self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]),
self.lookups.get_table("lexeme_norm"),
)
def to_disk(self, path, *, exclude=tuple()):
"""Save the current state to a directory.

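With the new property, assigning to `vocab.lookups` re-wires the `NORM` lexeme attribute getter. A small sketch of what that looks like from user code, assuming the `Lookups` API from `spacy.lookups` (the table contents are illustrative only):

```python
from spacy.lang.en import English
from spacy.lookups import Lookups

nlp = English()
lookups = Lookups()
lookups.add_table("lexeme_norm", {"gonna": "going to"})  # illustrative data
# The property setter above re-registers the NORM getter for the new table.
nlp.vocab.lookups = lookups
```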
View File

@ -121,18 +121,19 @@ customize those settings in your config file later.
> ```
```cli
$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu]
$ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [--cpu] [--pretraining]
```
| Name | Description |
| ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The config file for training. |
| Name | Description |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
| `--cpu`, `-C` | Whether the model needs to run on CPU. This will impact the choice of architecture, pretrained weights and related hyperparameters. ~~bool (flag)~~ |
| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | The config file for training. |
### init fill-config {#init-fill-config new="3"}
@ -160,13 +161,14 @@ validation error with more details.
$ python -m spacy init fill-config [base_path] [output_file] [--diff]
```
| Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |
| Name | Description |
| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `base_path` | Path to base config to fill, e.g. generated by the [quickstart widget](/usage/training#quickstart). ~~Path (positional)~~ |
| `output_file` | Path to output `.cfg` file. If not set, the config is written to stdout so you can pipe it forward to a file. ~~Path (positional)~~ |
| `--pretraining`, `-pt` | Include config for pretraining (with [`spacy pretrain`](/api/cli#pretrain)). Defaults to `False`. ~~bool (flag)~~ |
| `--diff`, `-D` | Print a visual diff highlighting the changes. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Complete and auto-filled config file for training. |
### init vocab {#init-vocab new="3" tag="command"}
@ -272,7 +274,7 @@ training -> dropout field required
training -> optimizer field required
training -> optimize extra fields not permitted
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
If your config contains missing values, you can run the 'init fill-config'
command to fill in all the defaults, if possible:
@ -355,6 +357,16 @@ Registry @architectures
Name spacy.MaxoutWindowEncoder.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 207)
[corpora.dev]
Registry @readers
Name spacy.Corpus.v1
Module spacy.training.corpus
File /path/to/spacy/training/corpus.py (line 18)
[corpora.train]
Registry @readers
Name spacy.Corpus.v1
Module spacy.training.corpus
File /path/to/spacy/training/corpus.py (line 18)
[training.logger]
Registry @loggers
Name spacy.ConsoleLogger.v1
@ -370,11 +382,6 @@ Registry @schedules
Name compounding.v1
Module thinc.schedules
File /path/to/thinc/thinc/schedules.py (line 43)
[training.dev_corpus]
Registry @readers
Name spacy.Corpus.v1
Module spacy.training.corpus
File /path/to/spacy/training/corpus.py (line 18)
[training.optimizer]
Registry @optimizers
Name Adam.v1
@ -385,11 +392,6 @@ Registry @schedules
Name warmup_linear.v1
Module thinc.schedules
File /path/to/thinc/thinc/schedules.py (line 91)
[training.train_corpus]
Registry @readers
Name spacy.Corpus.v1
Module spacy.training.corpus
File /path/to/spacy/training/corpus.py (line 18)
```
</Accordion>
@ -761,6 +763,7 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [overrides
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
@ -796,11 +799,12 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |
| `--epoch-resume`, `-er` | The epoch to resume counting from when using `--resume-path`. Prevents unintended overwriting of existing weight files. ~~Optional[int] \(option)~~ |
| `--gpu-id`, `-g` | GPU ID or `-1` for CPU. Defaults to `-1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.dropout 0.2`. ~~Any (option/flag)~~ |
| **CREATES** | The pretrained weights that can be used to initialize `spacy train`. |
@ -891,8 +895,6 @@ what you need. By default, spaCy's
can provide any other repo (public or private) that you have access to using the
`--repo` option.
<!-- TODO: update example once we've decided on repo structure -->
```cli
$ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
```
@ -900,7 +902,7 @@ $ python -m spacy project clone [name] [dest] [--repo] [--branch] [--sparse]
> #### Example
>
> ```cli
> $ python -m spacy project clone some_example
> $ python -m spacy project clone pipelines/ner_wikiner
> ```
>
> Clone from custom repo:

View File

@ -26,7 +26,7 @@ streaming.
> [paths]
> train = "corpus/train.spacy"
>
> [training.train_corpus]
> [corpora.train]
> @readers = "spacy.Corpus.v1"
> path = ${paths.train}
> gold_preproc = false
@ -135,7 +135,7 @@ Initialize the reader.
>
> ```ini
> ### Example config
> [pretraining.corpus]
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> path = "corpus/raw_text.jsonl"
> min_length = 0

View File

@ -60,7 +60,6 @@ your config and check that it's valid, you can run the
> [nlp]
> lang = "en"
> pipeline = ["tagger", "parser", "ner"]
> load_vocab_data = true
> before_creation = null
> after_creation = null
> after_pipeline_creation = null
@ -77,7 +76,6 @@ Defines the `nlp` object, its tokenizer and
| `lang` | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a pipeline is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
@ -121,6 +119,62 @@ that you don't want to hard-code in your config file.
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
```
### corpora {#config-corpora tag="section"}
> #### Example
>
> ```ini
> [corpora]
>
> [corpora.train]
> @readers = "spacy.Corpus.v1"
> path = ${paths:train}
>
> [corpora.dev]
> @readers = "spacy.Corpus.v1"
> path = ${paths:dev}
>
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> path = ${paths.raw}
>
> [corpora.my_custom_data]
> @readers = "my_custom_reader.v1"
> ```
This section defines a **dictionary** mapping of string keys to functions. Each
function takes an `nlp` object and yields [`Example`](/api/example) objects. By
default, the two keys `train` and `dev` are specified and each refers to a
[`Corpus`](/api/top-level#Corpus). When pretraining, an additional `pretrain`
section is added that defaults to a [`JsonlReader`](/api/top-level#JsonlReader).
You can also register custom functions that return a callable.
| Name | Description |
| ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `train` | Training data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ |
| `dev` | Development data corpus, typically used in `[training]` block. ~~Callable[[Language], Iterator[Example]]~~ |
| `pretrain` | Raw text for [pretraining](/usage/embeddings-transformers#pretraining), typically used in `[pretraining]` block (if available). ~~Callable[[Language], Iterator[Example]]~~ |
| ... | Any custom or alternative corpora. ~~Callable[[Language], Iterator[Example]]~~ |
Alternatively, the `[corpora]` block can refer to **one function** that returns
a dictionary keyed by the corpus names. This can be useful if you want to load a
single corpus once and then divide it up into `train` and `dev` partitions.
> #### Example
>
> ```ini
> [corpora]
> @readers = "my_custom_reader.v1"
> train_path = ${paths:train}
> dev_path = ${paths:dev}
> shuffle = true
>
> ```
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `corpora` | A dictionary keyed by string names, mapped to corpus functions that receive the current `nlp` object and return an iterator of [`Example`](/api/example) objects. ~~Dict[str, Callable[[Language], Iterator[Example]]]~~ |
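As a rough sketch, a single registered function like the hypothetical `my_custom_reader.v1` above could look like this (the reader name, its arguments and the `load_texts` helper are illustrative, not built-in):

```python
from typing import Callable, Iterator
import random
import spacy
from spacy.language import Language
from spacy.training import Example

@spacy.registry.readers("my_custom_reader.v1")
def create_corpora(train_path: str, dev_path: str, shuffle: bool = True):
    def make_corpus(path: str) -> Callable[[Language], Iterator[Example]]:
        def corpus(nlp: Language) -> Iterator[Example]:
            texts = list(load_texts(path))  # hypothetical loading helper
            if shuffle:
                random.shuffle(texts)
            for text in texts:
                doc = nlp.make_doc(text)
                yield Example.from_dict(doc, {})  # annotations would go here
        return corpus
    # One function provides all corpora, keyed by the corpus names.
    return {"train": make_corpus(train_path), "dev": make_corpus(dev_path)}
```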
### training {#config-training tag="section"}
This section defines settings and controls for the training and evaluation
@ -130,11 +184,13 @@ process that are used when you run [`spacy train`](/api/cli#train).
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ |
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
| `lookups` | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~ |
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
| `max_steps` | Maximum number of update steps to train for. Defaults to `20000`. ~~int~~ |
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
@ -142,7 +198,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
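Because `train_corpus` and `dev_corpus` are now plain dot-notation strings, resolving them in code goes through `spacy.util.dot_to_object`, as the textcat test earlier in this commit does. A minimal sketch, assuming `resolved_config` is a fully resolved config dict and `nlp` a loaded pipeline:

```python
from spacy.util import dot_to_object

# Look up the corpus function at the location named by training.dev_corpus
# (e.g. "corpora.dev") and call it with the nlp object.
dev_corpus = dot_to_object(resolved_config, resolved_config["training"]["dev_corpus"])
examples = list(dev_corpus(nlp))  # corpus functions take nlp, yield Examples
```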
### pretraining {#config-pretraining tag="section,optional"}
@ -151,17 +207,18 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/embeddings-transformers#pretraining). It's
used when you run [`spacy pretrain`](/api/cli#pretrain).
| Name | Description |
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ |
| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
| Name | Description |
| -------------- | ------------------------------------------------------------------------------------------------------ |
| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
## Training data {#training}
@ -418,7 +475,7 @@ lexical data.
Here's an example of the 20 most frequent lexemes in the English training data:
```json
%%GITHUB_SPACY / extra / example_data / vocab - data.jsonl
%%GITHUB_SPACY/extra/example_data/vocab-data.jsonl
```
## Pipeline meta {#meta}

View File

@ -84,7 +84,7 @@ Create a blank pipeline of a given language class. This function is the twin of
| _keyword-only_ | |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| `meta` <Tag variant="new">3</tag> | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
| `meta` <Tag variant="new">3</Tag> | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
### spacy.info {#spacy.info tag="function"}
@ -145,9 +145,10 @@ pipelines.
> nlp = spacy.load("en_core_web_sm")
> ```
| Name | Description |
| ----------- | --------------------------------------- |
| **RETURNS** | Whether the GPU was activated. ~~bool~~ |
| Name | Description |
| ----------- | ------------------------------------------------ |
| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
| **RETURNS** | Whether the GPU was activated. ~~bool~~ |
### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}
@ -164,9 +165,10 @@ and _before_ loading any pipelines.
> nlp = spacy.load("en_core_web_sm")
> ```
| Name | Description |
| ----------- | --------------- |
| **RETURNS** | `True` ~~bool~~ |
| Name | Description |
| ----------- | ------------------------------------------------ |
| `gpu_id` | Device index to select. Defaults to `0`. ~~int~~ |
| **RETURNS** | `True` ~~bool~~ |
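Both helpers now take the device index directly. A quick sketch of the updated signatures (the model name is just an example):

```python
import spacy

activated = spacy.prefer_gpu(0)  # use GPU 0 if available, returns a bool
# spacy.require_gpu(0)           # raises an error if GPU 0 is unavailable
nlp = spacy.load("en_core_web_sm")
```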
## displaCy {#displacy source="spacy/displacy"}
@ -448,7 +450,7 @@ remain in the config file stored on your local system.
> [training.logger]
> @loggers = "spacy.WandbLogger.v1"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> ```
| Name | Description |
@ -456,6 +458,16 @@ remain in the config file stored on your local system.
| `project_name` | The name of the project in the Weights & Biases interface. The project will be created automatically if it doesn't exist yet. ~~str~~ |
| `remove_config_values` | A list of values to include from the config before it is uploaded to W&B (default: empty). ~~List[str]~~ |
<Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It trains on the IMDB Movie Review Dataset and includes a
simple config with the built-in `WandbLogger`, as well as a custom example of
creating variants of the config for a simple hyperparameter grid search and
logging the results.
</Project>
## Readers {#readers source="spacy/training/corpus.py" new="3"}
Corpus readers are registered functions that load data and return a function
@ -478,7 +490,7 @@ the [`Corpus`](/api/corpus) class.
> [paths]
> train = "corpus/train.spacy"
>
> [training.train_corpus]
> [corpora.train]
> @readers = "spacy.Corpus.v1"
> path = ${paths.train}
> gold_preproc = false
@ -506,7 +518,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
> [paths]
> pretrain = "corpus/raw_text.jsonl"
>
> [pretraining.corpus]
> [corpora.pretrain]
> @readers = "spacy.JsonlReader.v1"
> path = ${paths.pretrain}
> min_length = 0

View File

@ -30,14 +30,20 @@ to predict. Otherwise, you could try using a "one-shot learning" approach using
<Accordion title="Whats the difference between word vectors and language models?" id="vectors-vs-language-models">
The key difference between [word vectors](#word-vectors) and contextual language
models such as [transformers](#transformers) is that word vectors model
**lexical types**, rather than _tokens_. If you have a list of terms with no
context around them, a transformer model like BERT can't really help you. BERT
is designed to understand language **in context**, which isn't what you have. A
word vectors table will be a much better fit for your task. However, if you do
have words in context — whole sentences or paragraphs of running text — word
vectors will only provide a very rough approximation of what the text is about.
[Transformers](#transformers) are large and powerful neural networks that give
you better accuracy, but are harder to deploy in production, as they require a
GPU to run effectively. [Word vectors](#word-vectors) are a slightly older
technique that can give your models a smaller improvement in accuracy, and can
also provide some additional capabilities.
The key difference between word-vectors and contextual language models such as
transformers is that word vectors model **lexical types**, rather than _tokens_.
If you have a list of terms with no context around them, a transformer model
like BERT can't really help you. BERT is designed to understand language **in
context**, which isn't what you have. A word vectors table will be a much better
fit for your task. However, if you do have words in context — whole sentences or
paragraphs of running text — word vectors will only provide a very rough
approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a
vector with a single indexing operation. Word vectors are therefore useful as a
@ -283,8 +289,7 @@ of objects by referring to creation functions, including functions you register
yourself. For details on how to get started with training your own model, check
out the [training quickstart](/usage/training#quickstart).
<!-- TODO:
<Project id="en_core_trf_lg">
<!-- TODO: <Project id="en_core_trf_lg">
The easiest way to get started is to clone a transformers-based project
template. Swap in your data, edit the settings and hyperparameters and train,
@ -478,7 +483,32 @@ training.
## Static vectors {#static-vectors}
<!-- TODO: write -->
If your pipeline includes a **word vectors table**, you'll be able to use the
`.similarity()` method on the [`Doc`](/api/doc), [`Span`](/api/span),
[`Token`](/api/token) and [`Lexeme`](/api/lexeme) objects. You'll also be able
to access the vectors using the `.vector` attribute, or you can look up one or
more vectors directly using the [`Vocab`](/api/vocab) object. Pipelines with
word vectors can also **use the vectors as features** for the statistical
models, which can **improve the accuracy** of your components.
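For example, with a pipeline that ships a vectors table such as `en_core_web_md`, these attributes can be used as follows (a minimal sketch, assuming the package is installed):

```python
import spacy

nlp = spacy.load("en_core_web_md")
doc1 = nlp("I like salty fries and hamburgers.")
doc2 = nlp("Fast food tastes very good.")
print(doc1.similarity(doc2))          # similarity based on word vectors
print(doc1[2].vector.shape)           # vector of the token "salty"
print(nlp.vocab["fries"].vector[:5])  # look up a vector directly in the vocab
```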
Word vectors in spaCy are "static" in the sense that they are not learned
parameters of the statistical models, and spaCy itself does not feature any
algorithms for learning word vector tables. You can train a word vectors table
using tools such as [Gensim](https://radimrehurek.com/gensim/),
[FastText](https://fasttext.cc/) or
[GloVe](https://nlp.stanford.edu/projects/glove/), or download existing
pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you
convert vectors for use with spaCy and will give you a directory you can load or
refer to in your [training configs](/usage/training#config).
<Infobox title="Word vectors and similarity" emoji="📖">
For more details on loading word vectors into spaCy, using them for similarity
and improving word vector coverage by truncating and pruning the vectors, see
the usage guide on
[word vectors and similarity](/usage/linguistic-features#vectors-similarity).
</Infobox>
### Using word vectors in your models {#word-vectors-models}
@ -579,33 +609,151 @@ def MyCustomVectors(
## Pretraining {#pretraining}
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
The [`spacy pretrain`](/api/cli#pretrain) command lets you initialize your
models with **information from raw text**. Without pretraining, the models for
your components will usually be initialized randomly. The idea behind
pretraining is simple: random probably isn't optimal, so if we have some text to
learn from, we can probably find a way to get the model off to a better start.
<!--
- explain general concept and idea (short!)
- present it as a separate lightweight mechanism for pretraining the tok2vec
layer
- advantages (could also be pros/cons table)
- explain how it generates a separate file (!) and how it depends on the same
vectors
-->
Pretraining uses the same [`config.cfg`](/usage/training#config) file as the
regular training, which helps keep the settings and hyperparameters consistent.
The additional `[pretraining]` section has several configuration subsections
that are familiar from the training block: the `[pretraining.batcher]`,
`[pretraining.optimizer]` and `[pretraining.corpus]` all work the same way and
expect the same types of objects, although for pretraining your corpus does not
need to have any annotations, so you will often use a different reader, such as
the [`JsonlReader`](/api/top-level#jsonlreader).
> #### Raw text format
>
> The raw text can be provided as JSONL (newline-delimited JSON) with a key
> `"text"` per entry. This allows the data to be read in line by line, while
> also allowing you to include newlines in the texts.
> The raw text can be provided in spaCy's
> [binary `.spacy` format](/api/data-formats#training) consisting of serialized
> `Doc` objects or as a JSONL (newline-delimited JSON) with a key `"text"` per
> entry. This allows the data to be read in line by line, while also allowing
> you to include newlines in the texts.
>
> ```json
> {"text": "Can I ask where you work now and what you do, and if you enjoy it?"}
> {"text": "They may just pull out of the Seattle market completely, at least until they have autonomous vehicles."}
> ```
>
> You can also use your own custom corpus loader instead.
You can add a `[pretraining]` block to your config by setting the
`--pretraining` flag on [`init config`](/api/cli#init-config) or
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config.cfg config_pretrain.cfg --pretraining
```
You can then run [`spacy pretrain`](/api/cli#pretrain) with the updated config
and pass in optional config overrides, like the path to the raw text file:
```cli
$ python -m spacy pretrain raw_text.jsonl /output config_pretrain.cfg
$ python -m spacy pretrain config_pretrain.cfg ./output --paths.raw text.jsonl
```
The following defaults are used for the `[pretraining]` block and merged into
your existing config when you run [`init config`](/api/cli#init-config) or
[`init fill-config`](/api/cli#init-fill-config) with `--pretraining`. If needed,
you can [configure](#pretraining-configure) the settings and hyperparameters or
change the [objective](#pretraining-objectives).
```ini
%%GITHUB_SPACY/spacy/default_config_pretraining.cfg
```
### How pretraining works {#pretraining-details}
The impact of [`spacy pretrain`](/api/cli#pretrain) varies, but it will usually
be worth trying if you're **not using a transformer** model and you have
**relatively little training data** (for instance, fewer than 5,000 sentences).
A good rule of thumb is that pretraining will generally give you a similar
accuracy improvement to using word vectors in your model. If word vectors have
given you a 10% error reduction, pretraining with spaCy might give you another
10%, for a 20% error reduction in total.
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a
network for a temporary task that forces the model to learn something about
sentence structure and word cooccurrence statistics. Pretraining produces a
**binary weights file** that can be loaded back in at the start of training. The
weights file specifies an initial set of weights. Training then proceeds as
normal.
You can only pretrain one subnetwork from your pipeline at a time, and the
subnetwork must be typed ~~Model[List[Doc], List[Floats2d]]~~ (i.e. it has to be
a "tok2vec" layer). The most common workflow is to use the
[`Tok2Vec`](/api/tok2vec) component to create a shared token-to-vector layer for
several components of your pipeline, and apply pretraining to its whole model.
#### Configuring the pretraining {#pretraining-configure}
The [`spacy pretrain`](/api/cli#pretrain) command is configured using the
`[pretraining]` section of your [config file](/usage/training#config). The
`component` and `layer` settings tell spaCy how to **find the subnetwork** to
pretrain. The `layer` setting should be either the empty string (to use the
whole model), or a
[node reference](https://thinc.ai/docs/usage-models#model-state). Most of
spaCy's built-in model architectures have a reference named `"tok2vec"` that
will refer to the right layer.
```ini
### config.cfg
# 1. Use the whole model of the "tok2vec" component
[pretraining]
component = "tok2vec"
layer = ""
# 2. Pretrain the "tok2vec" node of the "textcat" component
[pretraining]
component = "textcat"
layer = "tok2vec"
```
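Under the hood, the `layer` setting names a Thinc node reference that you can also resolve yourself. A sketch, assuming an `nlp` object whose `"textcat"` component's model exposes a `"tok2vec"` reference:

```python
# Resolve the same reference the layer setting points at.
tok2vec_layer = nlp.get_pipe("textcat").model.get_ref("tok2vec")
print(tok2vec_layer.name)
```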
#### Pretraining objectives {#pretraining-objectives}
Two pretraining objectives are available, both of which are variants of the
cloze task [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805) introduced
for BERT. The objective can be defined and configured via the
`[pretraining.objective]` config block.
> ```ini
> ### Characters objective
> [pretraining.objective]
> type = "characters"
> n_characters = 4
> ```
>
> ```ini
> ### Vectors objective
> [pretraining.objective]
> type = "vectors"
> loss = "cosine"
> ```
- **Characters:** The `"characters"` objective asks the model to predict some
number of leading and trailing UTF-8 bytes of each word. For instance, with
`n_characters = 2`, the model will try to predict the first two and last two
characters of the word (see the short illustration after this list).
- **Vectors:** The `"vectors"` objective asks the model to predict the word's
vector, from a static embeddings table. This requires a word vectors model to
be trained and loaded. The vectors objective can optimize either a cosine or
an L2 loss. We've generally found cosine loss to perform better.
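To make the characters objective concrete, here is the short illustration promised above; it only shows what the prediction targets are, not the model itself:

```python
# Illustration only: with n_characters = 2, the prediction targets for a
# word are its first two and its last two characters.
word = "difference"
n_characters = 2
target = (word[:n_characters], word[-n_characters:])  # ("di", "ce")
```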
These pretraining objectives use a trick that we term **language modelling with
approximate outputs (LMAO)**. The motivation for the trick is that predicting an
exact word ID introduces a lot of incidental complexity. You need a large output
layer, and even then, the vocabulary is too large, which motivates tokenization
schemes that do not align to actual word boundaries. At the end of training, the
output layer will be thrown away regardless: we just want a task that forces the
network to model something about word cooccurrence statistics. Predicting
leading and trailing characters does that more than adequately, as the exact
word sequence could be recovered with high accuracy if the initial and trailing
characters are predicted accurately. With the vectors objective, the pretraining
uses the embedding space learned by an algorithm such as
[GloVe](https://nlp.stanford.edu/projects/glove/) or
[Word2vec](https://code.google.com/archive/p/word2vec/), allowing the model to
focus on the contextual modelling we actually care about.

View File

@ -45,7 +45,7 @@ spaCy v3.0 introduces transformer-based pipelines that bring spaCy's accuracy
right up to **current state-of-the-art**. You can also use a CPU-optimized
pipeline, which is less accurate but much cheaper to run.
<!-- TODO: -->
<!-- TODO: update benchmarks and intro -->
> #### Evaluation details
>
@ -68,6 +68,6 @@ our project template.
</Project>
<!-- ## Citing spaCy {#citation}
<!-- TODO: ## Citing spaCy {#citation}
<!-- TODO: update -->
-->

View File

@ -356,6 +356,18 @@ that training configs are complete and experiments fully reproducible.
</Infobox>
Note that when using a PyTorch or TensorFlow model, it is recommended to set the
GPU memory allocator accordingly. When `gpu_allocator` is set to "pytorch" or
"tensorflow" in the training config, cupy will allocate memory via those
respective libraries, preventing OOM errors when there's available memory
sitting in the other library's pool.
```ini
### config.cfg (excerpt)
[training]
gpu_allocator = "pytorch"
```
## Custom models with Thinc {#thinc}
Of course it's also possible to define the `Model` from the previous section
@ -477,7 +489,7 @@ with Model.define_operators({">>": chain}):
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
</Infobox>
<!-- TODO:
<!-- TODO: write trainable component section
- Interaction with `predict`, `get_loss` and `set_annotations`
- Initialization life-cycle with `begin_training`, correlation with add_label
Example: relation extraction component (implemented as project template)

View File

@ -381,8 +381,6 @@ and loading pipeline packages, the underlying functionality is entirely based on
native Python packaging. This allows your application to handle a spaCy pipeline
like any other package dependency.
<!-- TODO: reference relevant spaCy project -->
### Downloading and requiring package dependencies {#models-download}
spaCy's built-in [`download`](/api/cli#download) command is mostly intended as a

View File

@ -29,15 +29,13 @@ and share your results with your team. spaCy projects can be used via the new
![Illustration of project workflow and commands](../images/projects.svg)
<!-- TODO:
<Project id="some_example_project">
<Project id="pipelines/tagger_parser_ud">
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
mattis pretium.
The easiest way to get started is to clone a project template and run it: for
example, this end-to-end template that lets you train a **part-of-speech
tagger** and **dependency parser** on a Universal Dependencies treebank.
</Project>
-->
spaCy projects make it easy to integrate with many other **awesome tools** in
the data science and machine learning ecosystem to track and manage your data
@ -65,10 +63,8 @@ project template and copies the files to a local directory. You can then run the
project, e.g. to train a pipeline and edit the commands and scripts to build
fully custom workflows.
<!-- TODO: update with real example project -->
```cli
python -m spacy project clone some_example_project
python -m spacy project clone pipelines/tagger_parser_ud
```
By default, the project will be cloned into the current working directory. You
@ -216,10 +212,8 @@ format, train a pipeline, evaluate it and export metrics, package it and spin up
a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.
<!-- TODO: update with better (final) example -->
```yaml
https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands/project.yml
https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
```
| Section | Description |
@ -927,6 +921,14 @@ package is installed in the same environment as spaCy, it will automatically add
[parallel training](/usage/training#parallel-training) for more details on how
it works under the hood.
<Project id="integrations/ray">
Get started with parallel training using our project template. It trains a
simple model on a Universal Dependencies Treebank and lets you parallelize the
training with Ray.
</Project>
You can integrate [`spacy ray train`](/api/cli#ray-train) into your
`project.yml` just like the regular training command and pass it the config, and
optional output directory or remote storage URL and config overrides if needed.
@ -946,10 +948,6 @@ commands:
- "training/model-best"
```
<!-- TODO: <Project id="integrations/ray">
</Project> -->
---
### Weights & Biases {#wandb} <IntegrationLogo name="wandb" width={175} height="auto" align="right" />
@ -969,21 +967,19 @@ your results.
> [training.logger]
> @loggers = "spacy.WandbLogger.v1"
> project_name = "monitor_spacy_training"
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
> ```
![Screenshot: Visualized training results](../images/wandb1.jpg)
![Screenshot: Parameter importance using config values](../images/wandb2.jpg 'Parameter importance using config values')
<!-- TODO:
<Project id="integrations/wandb">
Get started with tracking your spaCy training runs in Weights & Biases using our
project template. It includes a simple config using the `WandbLogger`, as well
as a custom logger implementation you can adjust for your specific use case.
project template. It trains on the IMDB Movie Review Dataset and includes a
simple config with the built-in `WandbLogger`, as well as a custom example of
creating variants of the config for a simple hyperparameter grid search and
logging the results.
</Project>
-->

View File

@ -574,7 +574,7 @@ The directory will be created if it doesn't exist, and the whole pipeline data,
meta and configuration will be written out. To make the pipeline more convenient
to deploy, we recommend wrapping it as a [Python package](/api/cli#package).
<Accordion title="Whats the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config">
<Accordion title="Whats the difference between the config.cfg and meta.json?" spaced id="models-meta-vs-config" spaced>
When you save a pipeline in spaCy v3.0+, two files will be exported: a
[`config.cfg`](/api/data-formats#config) based on
@ -596,6 +596,15 @@ based on [`nlp.meta`](/api/language#meta).
</Accordion>
<Project id="pipelines/tagger_parser_ud">
The easiest way to get started with an end-to-end workflow is to clone a
[project template](/usage/projects) and run it: for example, this template that
lets you train a **part-of-speech tagger** and **dependency parser** on a
Universal Dependencies treebank and generates an installable Python package.
</Project>
### Generating a pipeline package {#models-generating}
<Infobox title="Important note" variant="warning">
@ -699,5 +708,3 @@ class and call [`from_disk`](/api/language#from_disk) instead.
```python
nlp = spacy.blank("en").from_disk("/path/to/data")
```
<!-- TODO: point to spaCy projects? -->

View File

@ -92,7 +92,7 @@ spaCy's binary `.spacy` format. You can either include the data paths in the
$ python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
```
<Accordion title="How are the config recommendations generated?" id="quickstart-source">
<Accordion title="How are the config recommendations generated?" id="quickstart-source" spaced>
The recommended config settings generated by the quickstart widget and the
[`init config`](/api/cli#init-config) command are based on some general **best
@ -112,6 +112,15 @@ as we run more experiments.
</Accordion>
<Project id="pipelines/tagger_parser_ud">
The easiest way to get started is to clone a [project template](/usage/projects)
and run it: for example, this end-to-end template that lets you train a
**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
treebank.
</Project>
## Training config {#config}
Training config files include all **settings and hyperparameters** for training
@ -746,7 +755,7 @@ as **config settings**: in this case, `source`.
> #### config.cfg
>
> ```ini
> [training.train_corpus]
> [corpora.train]
> @readers = "corpus_variants.v1"
> source = "s3://your_bucket/path/data.csv"
> ```
@ -886,9 +895,13 @@ cluster. If it's not set, Ray will run locally.
python -m spacy ray train config.cfg --n-workers 2
```
<!-- TODO: <Project id="integrations/ray">
<Project id="integrations/ray">
</Project> -->
Get started with parallel training using our project template. It trains a
simple model on a Universal Dependencies Treebank and lets you parallelize the
training with Ray.
</Project>
### How parallel training works {#parallel-training-details}

View File

@ -176,18 +176,16 @@ freely combine implementations from different frameworks into a single model.
### Manage end-to-end workflows with projects {#features-projects}
<!-- TODO: update example -->
> #### Example
>
> ```cli
> # Clone a project template
> $ python -m spacy project clone example
> $ cd example
> $ python -m spacy project clone pipelines/tagger_parser_ud
> $ cd tagger_parser_ud
> # Download data assets
> $ python -m spacy project assets
> # Run a workflow
> $ python -m spacy project run train
> $ python -m spacy project run all
> ```
spaCy projects let you manage and share **end-to-end spaCy workflows** for
@ -207,14 +205,6 @@ data, [Streamlit](/usage/projects#streamlit) for building interactive apps,
[Ray](/usage/projects#ray) for parallel training,
[Weights & Biases](/usage/projects#wandb) for experiment tracking, and more!
<!-- <Project id="some_example_project">
The easiest way to get started with an end-to-end training process is to clone a
[project](/usage/projects) template. Projects let you manage multi-step
workflows, from data preprocessing to training and packaging your pipeline.
</Project>-->
<Infobox title="Details & Documentation" emoji="📖" list>
- **Usage:** [spaCy projects](/usage/projects),
@ -224,6 +214,15 @@ workflows, from data preprocessing to training and packaging your pipeline.
</Infobox>
<Project id="pipelines/tagger_parser_ud">
The easiest way to get started is to clone a [project template](/usage/projects)
and run it: for example, this end-to-end template that lets you train a
**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
treebank.
</Project>
### Parallel and distributed training with Ray {#features-parallel-training}
> #### Example
@ -710,6 +709,48 @@ nlp = spacy.blank("en")
+ nlp.add_pipe("ner", source=source_nlp)
```
#### Configuring pipeline components with settings {#migrating-configure-pipe}
Because pipeline components are now added using their string names, you won't
have to instantiate the [component classes](/api/#architecture-pipeline)
directly anymore. To configure the component, you can now use the `config`
argument on [`nlp.add_pipe`](/api/language#add_pipe).
> #### config.cfg (excerpt)
>
> ```ini
> [components.sentencizer]
> factory = "sentencizer"
> punct_chars = ["!", ".", "?"]
> ```
```diff
punct_chars = ["!", ".", "?"]
- sentencizer = Sentencizer(punct_chars=punct_chars)
+ sentencizer = nlp.add_pipe("sentencizer", config={"punct_chars": punct_chars})
```
The `config` corresponds to the component settings in the
[`config.cfg`](/usage/training#config-components) and will overwrite the default
config defined by the components.
<Infobox variant="warning" title="Important note on config values">
Config values you pass to components **need to be JSON-serializable** and can't
be arbitrary Python objects. Otherwise, the settings you provide can't be
represented in the `config.cfg` and spaCy has no way of knowing how to re-create
your component with the same settings when you load the pipeline back in. If you
need to pass arbitrary objects to a component, use a
[registered function](/usage/processing-pipelines#example-stateful-components):
```diff
- config = {"model": MyTaggerModel()}
+ config= {"model": {"@architectures": "MyTaggerModel"}}
tagger = nlp.add_pipe("tagger", config=config)
```
</Infobox>
### Adding match patterns {#migrating-matcher}
The [`Matcher.add`](/api/matcher#add),
@ -833,7 +874,14 @@ values. You can then use the auto-generated `config.cfg` for training:
+ python -m spacy train ./config.cfg --output ./output
```
<!-- TODO: project template -->
<Project id="pipelines/tagger_parser_ud">
The easiest way to get started is to clone a [project template](/usage/projects)
and run it: for example, this end-to-end template that lets you train a
**part-of-speech tagger** and **dependency parser** on a Universal Dependencies
treebank.
</Project>
#### Training via the Python API {#migrating-training-python}

View File

@ -12,6 +12,7 @@
"companyUrl": "https://explosion.ai",
"repo": "explosion/spaCy",
"modelsRepo": "explosion/spacy-models",
"projectsRepo": "explosion/projects/tree/v3",
"social": {
"twitter": "spacy_io",
"github": "explosion"

View File

@ -13,7 +13,7 @@ export default function Tag({ spaced = false, variant, tooltip, children }) {
const isValid = isString(children) && !isNaN(children)
const version = isValid ? Number(children).toFixed(1) : children
const tooltipText = `This feature is new and was introduced in spaCy v${version}`
// TODO: we probably want to handle this more elegantly, but the idea is
// We probably want to handle this more elegantly, but the idea is
// that we can hide tags referring to old versions
const major = isString(version) ? Number(version.split('.')[0]) : version
return major < MIN_VERSION ? null : (

View File

@ -10,6 +10,7 @@ const htmlToReactParser = new HtmlToReactParser()
const DEFAULT_BRANCH = 'develop'
export const repo = siteMetadata.repo
export const modelsRepo = siteMetadata.modelsRepo
export const projectsRepo = siteMetadata.projectsRepo
/**
* This is used to provide selectors for headings so they can be crawled by

View File

@ -15,6 +15,10 @@
background: transparent
resize: none
font: inherit
overflow: hidden
white-space: nowrap
text-overflow: ellipsis
margin-right: 1rem
.prefix
margin-right: 0.75em

View File

@ -30,7 +30,6 @@ import Benchmarks from 'usage/_benchmarks-models.md'
const CODE_EXAMPLE = `# pip install spacy
# python -m spacy download en_core_web_sm
import spacy
# Load English tokenizer, tagger, parser and NER
@ -120,7 +119,7 @@ const Landing = ({ data }) => {
</Li>
<Li>
Components for <strong>named entity</strong> recognition,
part-of-speech-tagging, dependency parsing, sentence segmentation,{' '}
part-of-speech tagging, dependency parsing, sentence segmentation,{' '}
<strong>text classification</strong>, lemmatization, morphological
analysis, entity linking and more
</Li>
@ -223,10 +222,11 @@ const Landing = ({ data }) => {
<br />
<br />
<br />
{/** TODO: update with actual example */}
<Project id="some_example">
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
sodales lectus.
<Project id="pipelines/tagger_parser_ud" title="Get started">
The easiest way to get started is to clone a project template and run it:
for example, this template for training a{' '}
<strong>part-of-speech tagger</strong> and{' '}
<strong>dependency parser</strong> on a Universal Dependencies treebank.
</Project>
</LandingCol>
<LandingCol>

View File

@ -4,25 +4,29 @@ import CopyInput from '../components/copy'
import Infobox from '../components/infobox'
import Link from '../components/link'
import { InlineCode } from '../components/code'
import { projectsRepo } from '../components/util'
// TODO: move to meta?
const DEFAULT_REPO = 'https://github.com/explosion/projects/tree/v3'
const COMMAND = 'python -m spacy project clone'
export default function Project({ id, repo, children }) {
export default function Project({
title = 'Get started with a project template',
id,
repo,
children,
}) {
const repoArg = repo ? ` --repo ${repo}` : ''
const text = `${COMMAND} ${id}${repoArg}`
const url = `${repo || DEFAULT_REPO}/${id}`
const title = (
const url = `${repo || projectsRepo}/${id}`
const header = (
<>
Get started with a project template:{' '}
{title}:{' '}
<Link to={url}>
<InlineCode>{id}</InlineCode>
</Link>
</>
)
return (
<Infobox title={title} emoji="🪐">
<Infobox title={header} emoji="🪐">
{children}
<CopyInput text={text} prefix="$" />
</Infobox>