mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 10:46:29 +03:00
Merge pull request #6078 from svlandeg/fix/corpus
This commit is contained in:
commit
a127fa475e
|
@ -6,7 +6,7 @@ requires = [
|
||||||
"cymem>=2.0.2,<2.1.0",
|
"cymem>=2.0.2,<2.1.0",
|
||||||
"preshed>=3.0.2,<3.1.0",
|
"preshed>=3.0.2,<3.1.0",
|
||||||
"murmurhash>=0.28.0,<1.1.0",
|
"murmurhash>=0.28.0,<1.1.0",
|
||||||
"thinc>=8.0.0a31,<8.0.0a40",
|
"thinc>=8.0.0a33,<8.0.0a40",
|
||||||
"blis>=0.4.0,<0.5.0",
|
"blis>=0.4.0,<0.5.0",
|
||||||
"pytokenizations",
|
"pytokenizations",
|
||||||
"pathy"
|
"pathy"
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a31,<8.0.0a40
|
thinc>=8.0.0a33,<8.0.0a40
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
ml_datasets>=0.1.1
|
ml_datasets==0.2.0a0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
thinc>=8.0.0a31,<8.0.0a40
|
thinc>=8.0.0a33,<8.0.0a40
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0a31,<8.0.0a40
|
thinc>=8.0.0a33,<8.0.0a40
|
||||||
blis>=0.4.0,<0.5.0
|
blis>=0.4.0,<0.5.0
|
||||||
wasabi>=0.8.0,<1.1.0
|
wasabi>=0.8.0,<1.1.0
|
||||||
srsly>=2.1.0,<3.0.0
|
srsly>=2.1.0,<3.0.0
|
||||||
|
|
|
@ -20,6 +20,7 @@ from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..attrs import ID
|
from ..attrs import ID
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from ..util import dot_to_object
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -70,9 +71,7 @@ def pretrain_cli(
|
||||||
|
|
||||||
with show_validation_error(config_path):
|
with show_validation_error(config_path):
|
||||||
config = util.load_config(
|
config = util.load_config(
|
||||||
config_path,
|
config_path, overrides=config_overrides, interpolate=True
|
||||||
overrides=config_overrides,
|
|
||||||
interpolate=True
|
|
||||||
)
|
)
|
||||||
if not config.get("pretraining"):
|
if not config.get("pretraining"):
|
||||||
# TODO: What's the solution here? How do we handle optional blocks?
|
# TODO: What's the solution here? How do we handle optional blocks?
|
||||||
|
@ -83,7 +82,7 @@ def pretrain_cli(
|
||||||
|
|
||||||
config.to_disk(output_dir / "config.cfg")
|
config.to_disk(output_dir / "config.cfg")
|
||||||
msg.good("Saved config file in the output directory")
|
msg.good("Saved config file in the output directory")
|
||||||
|
|
||||||
pretrain(
|
pretrain(
|
||||||
config,
|
config,
|
||||||
output_dir,
|
output_dir,
|
||||||
|
@ -98,7 +97,7 @@ def pretrain(
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
resume_path: Optional[Path] = None,
|
resume_path: Optional[Path] = None,
|
||||||
epoch_resume: Optional[int] = None,
|
epoch_resume: Optional[int] = None,
|
||||||
use_gpu: int=-1
|
use_gpu: int = -1,
|
||||||
):
|
):
|
||||||
if config["system"].get("seed") is not None:
|
if config["system"].get("seed") is not None:
|
||||||
fix_random_seed(config["system"]["seed"])
|
fix_random_seed(config["system"]["seed"])
|
||||||
|
@ -106,7 +105,7 @@ def pretrain(
|
||||||
use_pytorch_for_gpu_memory()
|
use_pytorch_for_gpu_memory()
|
||||||
nlp, config = util.load_model_from_config(config)
|
nlp, config = util.load_model_from_config(config)
|
||||||
P_cfg = config["pretraining"]
|
P_cfg = config["pretraining"]
|
||||||
corpus = P_cfg["corpus"]
|
corpus = dot_to_object(config, P_cfg["corpus"])
|
||||||
batcher = P_cfg["batcher"]
|
batcher = P_cfg["batcher"]
|
||||||
model = create_pretraining_model(nlp, config["pretraining"])
|
model = create_pretraining_model(nlp, config["pretraining"])
|
||||||
optimizer = config["pretraining"]["optimizer"]
|
optimizer = config["pretraining"]["optimizer"]
|
||||||
|
@ -147,9 +146,7 @@ def pretrain(
|
||||||
progress = tracker.update(epoch, loss, docs)
|
progress = tracker.update(epoch, loss, docs)
|
||||||
if progress:
|
if progress:
|
||||||
msg.row(progress, **row_settings)
|
msg.row(progress, **row_settings)
|
||||||
if P_cfg["n_save_every"] and (
|
if P_cfg["n_save_every"] and (batch_id % P_cfg["n_save_every"] == 0):
|
||||||
batch_id % P_cfg["n_save_every"] == 0
|
|
||||||
):
|
|
||||||
_save_model(epoch, is_temp=True)
|
_save_model(epoch, is_temp=True)
|
||||||
_save_model(epoch)
|
_save_model(epoch)
|
||||||
tracker.epoch_loss = 0.0
|
tracker.epoch_loss = 0.0
|
||||||
|
|
|
@ -173,6 +173,18 @@ factory = "{{ pipe }}"
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
max_length = 0
|
||||||
|
|
||||||
[training]
|
[training]
|
||||||
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
{% if use_transformer or optimize == "efficiency" or not word_vectors -%}
|
||||||
vectors = null
|
vectors = null
|
||||||
|
@ -182,11 +194,12 @@ vectors = "{{ word_vectors }}"
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
|
||||||
[training.optimizer]
|
[training.optimizer]
|
||||||
@optimizers = "Adam.v1"
|
@optimizers = "Adam.v1"
|
||||||
|
|
||||||
|
|
||||||
{% if use_transformer -%}
|
{% if use_transformer -%}
|
||||||
[training.optimizer.learn_rate]
|
[training.optimizer.learn_rate]
|
||||||
@schedules = "warmup_linear.v1"
|
@schedules = "warmup_linear.v1"
|
||||||
|
@ -195,16 +208,6 @@ total_steps = 20000
|
||||||
initial_rate = 5e-5
|
initial_rate = 5e-5
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.train}
|
|
||||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.dev}
|
|
||||||
max_length = 0
|
|
||||||
|
|
||||||
{% if use_transformer %}
|
{% if use_transformer %}
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "spacy.batch_by_padded.v1"
|
@batchers = "spacy.batch_by_padded.v1"
|
||||||
|
|
|
@ -18,6 +18,7 @@ from ..language import Language
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..training.example import Example
|
from ..training.example import Example
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
from ..util import dot_to_object
|
||||||
|
|
||||||
|
|
||||||
@app.command(
|
@app.command(
|
||||||
|
@ -92,8 +93,8 @@ def train(
|
||||||
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
raw_text, tag_map, morph_rules, weights_data = load_from_paths(config)
|
||||||
T_cfg = config["training"]
|
T_cfg = config["training"]
|
||||||
optimizer = T_cfg["optimizer"]
|
optimizer = T_cfg["optimizer"]
|
||||||
train_corpus = T_cfg["train_corpus"]
|
train_corpus = dot_to_object(config, T_cfg["train_corpus"])
|
||||||
dev_corpus = T_cfg["dev_corpus"]
|
dev_corpus = dot_to_object(config, T_cfg["dev_corpus"])
|
||||||
batcher = T_cfg["batcher"]
|
batcher = T_cfg["batcher"]
|
||||||
train_logger = T_cfg["logger"]
|
train_logger = T_cfg["logger"]
|
||||||
# Components that shouldn't be updated during training
|
# Components that shouldn't be updated during training
|
||||||
|
|
|
@ -22,6 +22,33 @@ after_pipeline_creation = null
|
||||||
|
|
||||||
[components]
|
[components]
|
||||||
|
|
||||||
|
# Readers for corpora like dev and train.
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.train]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.train}
|
||||||
|
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||||
|
# and tokens. If you set this to true, take care to ensure your run-time
|
||||||
|
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||||
|
gold_preproc = false
|
||||||
|
# Limitations on training document length
|
||||||
|
max_length = 0
|
||||||
|
# Limitation on number of training examples
|
||||||
|
limit = 0
|
||||||
|
|
||||||
|
[corpora.dev]
|
||||||
|
@readers = "spacy.Corpus.v1"
|
||||||
|
path = ${paths.dev}
|
||||||
|
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||||
|
# and tokens. If you set this to true, take care to ensure your run-time
|
||||||
|
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||||
|
gold_preproc = false
|
||||||
|
# Limitations on training document length
|
||||||
|
max_length = 0
|
||||||
|
# Limitation on number of training examples
|
||||||
|
limit = 0
|
||||||
|
|
||||||
# Training hyper-parameters and additional features.
|
# Training hyper-parameters and additional features.
|
||||||
[training]
|
[training]
|
||||||
seed = ${system.seed}
|
seed = ${system.seed}
|
||||||
|
@ -40,33 +67,14 @@ eval_frequency = 200
|
||||||
score_weights = {}
|
score_weights = {}
|
||||||
# Names of pipeline components that shouldn't be updated during training
|
# Names of pipeline components that shouldn't be updated during training
|
||||||
frozen_components = []
|
frozen_components = []
|
||||||
|
# Location in the config where the dev corpus is defined
|
||||||
|
dev_corpus = "corpora.dev"
|
||||||
|
# Location in the config where the train corpus is defined
|
||||||
|
train_corpus = "corpora.train"
|
||||||
|
|
||||||
[training.logger]
|
[training.logger]
|
||||||
@loggers = "spacy.ConsoleLogger.v1"
|
@loggers = "spacy.ConsoleLogger.v1"
|
||||||
|
|
||||||
[training.train_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.train}
|
|
||||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
|
||||||
# and tokens. If you set this to true, take care to ensure your run-time
|
|
||||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
|
||||||
gold_preproc = false
|
|
||||||
# Limitations on training document length
|
|
||||||
max_length = 0
|
|
||||||
# Limitation on number of training examples
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.dev_corpus]
|
|
||||||
@readers = "spacy.Corpus.v1"
|
|
||||||
path = ${paths.dev}
|
|
||||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
|
||||||
# and tokens. If you set this to true, take care to ensure your run-time
|
|
||||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
|
||||||
gold_preproc = false
|
|
||||||
# Limitations on training document length
|
|
||||||
max_length = 0
|
|
||||||
# Limitation on number of training examples
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "spacy.batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
|
|
@ -4,6 +4,7 @@ dropout = 0.2
|
||||||
n_save_every = null
|
n_save_every = null
|
||||||
component = "tok2vec"
|
component = "tok2vec"
|
||||||
layer = ""
|
layer = ""
|
||||||
|
corpus = "corpora.pretrain"
|
||||||
|
|
||||||
[pretraining.batcher]
|
[pretraining.batcher]
|
||||||
@batchers = "spacy.batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
|
@ -12,13 +13,6 @@ discard_oversize = false
|
||||||
tolerance = 0.2
|
tolerance = 0.2
|
||||||
get_length = null
|
get_length = null
|
||||||
|
|
||||||
[pretraining.corpus]
|
|
||||||
@readers = "spacy.JsonlReader.v1"
|
|
||||||
path = ${paths.raw}
|
|
||||||
min_length = 5
|
|
||||||
max_length = 500
|
|
||||||
limit = 0
|
|
||||||
|
|
||||||
[pretraining.objective]
|
[pretraining.objective]
|
||||||
type = "characters"
|
type = "characters"
|
||||||
n_characters = 4
|
n_characters = 4
|
||||||
|
@ -33,3 +27,12 @@ grad_clip = 1.0
|
||||||
use_averages = true
|
use_averages = true
|
||||||
eps = 1e-8
|
eps = 1e-8
|
||||||
learn_rate = 0.001
|
learn_rate = 0.001
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
|
||||||
|
[corpora.pretrain]
|
||||||
|
@readers = "spacy.JsonlReader.v1"
|
||||||
|
path = ${paths.raw}
|
||||||
|
min_length = 5
|
||||||
|
max_length = 500
|
||||||
|
limit = 0
|
||||||
|
|
|
@ -181,9 +181,9 @@ class TextCategorizer(Pipe):
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
|
DOCS: https://nightly.spacy.io/api/textcategorizer#predict
|
||||||
"""
|
"""
|
||||||
tensors = [doc.tensor for doc in docs]
|
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle cases where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
|
tensors = [doc.tensor for doc in docs]
|
||||||
xp = get_array_module(tensors)
|
xp = get_array_module(tensors)
|
||||||
scores = xp.zeros((len(docs), len(self.labels)))
|
scores = xp.zeros((len(docs), len(self.labels)))
|
||||||
return scores
|
return scores
|
||||||
|
|
|
@ -104,7 +104,7 @@ class TokenPatternOperator(str, Enum):
|
||||||
StringValue = Union[TokenPatternString, StrictStr]
|
StringValue = Union[TokenPatternString, StrictStr]
|
||||||
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
NumberValue = Union[TokenPatternNumber, StrictInt, StrictFloat]
|
||||||
UnderscoreValue = Union[
|
UnderscoreValue = Union[
|
||||||
TokenPatternString, TokenPatternNumber, str, int, float, list, bool,
|
TokenPatternString, TokenPatternNumber, str, int, float, list, bool
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -198,8 +198,8 @@ class ModelMetaSchema(BaseModel):
|
||||||
class ConfigSchemaTraining(BaseModel):
|
class ConfigSchemaTraining(BaseModel):
|
||||||
# fmt: off
|
# fmt: off
|
||||||
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
vectors: Optional[StrictStr] = Field(..., title="Path to vectors")
|
||||||
train_corpus: Reader = Field(..., title="Reader for the training data")
|
dev_corpus: StrictStr = Field(..., title="Path in the config to the dev data")
|
||||||
dev_corpus: Reader = Field(..., title="Reader for the dev data")
|
train_corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||||
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
|
patience: StrictInt = Field(..., title="How many steps to continue without improvement in evaluation score")
|
||||||
|
@ -249,7 +249,7 @@ class ConfigSchemaPretrain(BaseModel):
|
||||||
dropout: StrictFloat = Field(..., title="Dropout rate")
|
dropout: StrictFloat = Field(..., title="Dropout rate")
|
||||||
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
n_save_every: Optional[StrictInt] = Field(..., title="Saving frequency")
|
||||||
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
optimizer: Optimizer = Field(..., title="The optimizer to use")
|
||||||
corpus: Reader = Field(..., title="Reader for the training data")
|
corpus: StrictStr = Field(..., title="Path in the config to the training data")
|
||||||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||||
component: str = Field(..., title="Component to find the layer to pretrain")
|
component: str = Field(..., title="Component to find the layer to pretrain")
|
||||||
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
||||||
|
@ -268,6 +268,7 @@ class ConfigSchema(BaseModel):
|
||||||
nlp: ConfigSchemaNlp
|
nlp: ConfigSchemaNlp
|
||||||
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {}
|
||||||
components: Dict[str, Dict[str, Any]]
|
components: Dict[str, Dict[str, Any]]
|
||||||
|
corpora: Dict[str, Reader]
|
||||||
|
|
||||||
@root_validator(allow_reuse=True)
|
@root_validator(allow_reuse=True)
|
||||||
def validate_config(cls, values):
|
def validate_config(cls, values):
|
||||||
|
|
|
@ -9,7 +9,7 @@ from spacy.tokens import Doc
|
||||||
from spacy.training import Example
|
from spacy.training import Example
|
||||||
from spacy import util
|
from spacy import util
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from .util import get_batch
|
from ..util import get_batch
|
||||||
|
|
||||||
from thinc.api import Config
|
from thinc.api import Config
|
||||||
|
|
|
@ -17,16 +17,18 @@ nlp_config_string = """
|
||||||
train = ""
|
train = ""
|
||||||
dev = ""
|
dev = ""
|
||||||
|
|
||||||
[training]
|
[corpora]
|
||||||
|
|
||||||
[training.train_corpus]
|
[corpora.train]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.train}
|
path = ${paths.train}
|
||||||
|
|
||||||
[training.dev_corpus]
|
[corpora.dev]
|
||||||
@readers = "spacy.Corpus.v1"
|
@readers = "spacy.Corpus.v1"
|
||||||
path = ${paths.dev}
|
path = ${paths.dev}
|
||||||
|
|
||||||
|
[training]
|
||||||
|
|
||||||
[training.batcher]
|
[training.batcher]
|
||||||
@batchers = "spacy.batch_by_words.v1"
|
@batchers = "spacy.batch_by_words.v1"
|
||||||
size = 666
|
size = 666
|
||||||
|
@ -300,20 +302,20 @@ def test_config_overrides():
|
||||||
|
|
||||||
def test_config_interpolation():
|
def test_config_interpolation():
|
||||||
config = Config().from_str(nlp_config_string, interpolate=False)
|
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||||
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
|
assert config["corpora"]["train"]["path"] == "${paths.train}"
|
||||||
interpolated = config.interpolate()
|
interpolated = config.interpolate()
|
||||||
assert interpolated["training"]["train_corpus"]["path"] == ""
|
assert interpolated["corpora"]["train"]["path"] == ""
|
||||||
nlp = English.from_config(config)
|
nlp = English.from_config(config)
|
||||||
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
|
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
|
||||||
# Ensure that variables are preserved in nlp config
|
# Ensure that variables are preserved in nlp config
|
||||||
width = "${components.tok2vec.model.width}"
|
width = "${components.tok2vec.model.width}"
|
||||||
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||||
interpolated2 = nlp.config.interpolate()
|
interpolated2 = nlp.config.interpolate()
|
||||||
assert interpolated2["training"]["train_corpus"]["path"] == ""
|
assert interpolated2["corpora"]["train"]["path"] == ""
|
||||||
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
nlp2 = English.from_config(interpolated)
|
nlp2 = English.from_config(interpolated)
|
||||||
assert nlp2.config["training"]["train_corpus"]["path"] == ""
|
assert nlp2.config["corpora"]["train"]["path"] == ""
|
||||||
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
|
||||||
|
|
||||||
|
|
||||||
|
|
0
spacy/tests/training/__init__.py
Normal file
0
spacy/tests/training/__init__.py
Normal file
112
spacy/tests/training/test_readers.py
Normal file
112
spacy/tests/training/test_readers.py
Normal file
|
@ -0,0 +1,112 @@
|
||||||
|
from typing import Dict, Iterable, Callable
|
||||||
|
import pytest
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
|
from spacy import Language
|
||||||
|
from spacy.util import load_model_from_config, registry, dot_to_object
|
||||||
|
from spacy.training import Example
|
||||||
|
|
||||||
|
|
||||||
|
def test_readers():
|
||||||
|
config_string = """
|
||||||
|
[training]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
@readers = "myreader.v1"
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "textcat"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.textcat]
|
||||||
|
factory = "textcat"
|
||||||
|
"""
|
||||||
|
|
||||||
|
@registry.readers.register("myreader.v1")
|
||||||
|
def myreader() -> Dict[str, Callable[[Language, str], Iterable[Example]]]:
|
||||||
|
annots = {"cats": {"POS": 1.0, "NEG": 0.0}}
|
||||||
|
|
||||||
|
def reader(nlp: Language):
|
||||||
|
doc = nlp.make_doc(f"This is an example")
|
||||||
|
return [Example.from_dict(doc, annots)]
|
||||||
|
|
||||||
|
return {"train": reader, "dev": reader, "extra": reader, "something": reader}
|
||||||
|
|
||||||
|
config = Config().from_str(config_string)
|
||||||
|
nlp, resolved = load_model_from_config(config, auto_fill=True)
|
||||||
|
|
||||||
|
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
|
||||||
|
assert isinstance(train_corpus, Callable)
|
||||||
|
optimizer = resolved["training"]["optimizer"]
|
||||||
|
# simulate a training loop
|
||||||
|
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
|
for example in train_corpus(nlp):
|
||||||
|
nlp.update([example], sgd=optimizer)
|
||||||
|
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
|
||||||
|
scores = nlp.evaluate(list(dev_corpus(nlp)))
|
||||||
|
assert scores["cats_score"]
|
||||||
|
# ensure the pipeline runs
|
||||||
|
doc = nlp("Quick test")
|
||||||
|
assert doc.cats
|
||||||
|
extra_corpus = resolved["corpora"]["extra"]
|
||||||
|
assert isinstance(extra_corpus, Callable)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"reader,additional_config",
|
||||||
|
[
|
||||||
|
("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
|
||||||
|
("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
|
||||||
|
("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_cat_readers(reader, additional_config):
|
||||||
|
nlp_config_string = """
|
||||||
|
[training]
|
||||||
|
|
||||||
|
[corpora]
|
||||||
|
@readers = "PLACEHOLDER"
|
||||||
|
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "textcat"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.textcat]
|
||||||
|
factory = "textcat"
|
||||||
|
"""
|
||||||
|
config = Config().from_str(nlp_config_string)
|
||||||
|
config["corpora"]["@readers"] = reader
|
||||||
|
config["corpora"].update(additional_config)
|
||||||
|
nlp, resolved = load_model_from_config(config, auto_fill=True)
|
||||||
|
|
||||||
|
train_corpus = dot_to_object(resolved, resolved["training"]["train_corpus"])
|
||||||
|
optimizer = resolved["training"]["optimizer"]
|
||||||
|
# simulate a training loop
|
||||||
|
nlp.begin_training(lambda: train_corpus(nlp), sgd=optimizer)
|
||||||
|
for example in train_corpus(nlp):
|
||||||
|
assert example.y.cats
|
||||||
|
# this shouldn't fail if each training example has at least one positive label
|
||||||
|
assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
|
||||||
|
nlp.update([example], sgd=optimizer)
|
||||||
|
# simulate performance benchmark on dev corpus
|
||||||
|
dev_corpus = dot_to_object(resolved, resolved["training"]["dev_corpus"])
|
||||||
|
dev_examples = list(dev_corpus(nlp))
|
||||||
|
for example in dev_examples:
|
||||||
|
# this shouldn't fail if each dev example has at least one positive label
|
||||||
|
assert sorted(list(set(example.y.cats.values()))) == [0.0, 1.0]
|
||||||
|
scores = nlp.evaluate(dev_examples)
|
||||||
|
assert scores["cats_score"]
|
||||||
|
# ensure the pipeline runs
|
||||||
|
doc = nlp("Quick test")
|
||||||
|
assert doc.cats
|
|
@ -12,7 +12,7 @@ from thinc.api import compounding
|
||||||
import pytest
|
import pytest
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from .util import make_tempdir, get_doc
|
from ..util import make_tempdir, get_doc
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
|
@ -274,7 +274,7 @@ training -> dropout field required
|
||||||
training -> optimizer field required
|
training -> optimizer field required
|
||||||
training -> optimize extra fields not permitted
|
training -> optimize extra fields not permitted
|
||||||
|
|
||||||
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'dev_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}, 'train_corpus': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}}
|
{'vectors': 'en_vectors_web_lg', 'seed': 0, 'accumulate_gradient': 1, 'init_tok2vec': None, 'raw_text': None, 'patience': 1600, 'max_epochs': 0, 'max_steps': 20000, 'eval_frequency': 200, 'frozen_components': [], 'optimize': None, 'batcher': {'@batchers': 'spacy.batch_by_words.v1', 'discard_oversize': False, 'tolerance': 0.2, 'get_length': None, 'size': {'@schedules': 'compounding.v1', 'start': 100, 'stop': 1000, 'compound': 1.001, 't': 0.0}}, 'corpus': {'train': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}, 'dev': {'@readers': 'spacy.Corpus.v1', 'path': '', 'max_length': 0, 'gold_preproc': False, 'limit': 0}} 'score_weights': {'tag_acc': 0.5, 'dep_uas': 0.25, 'dep_las': 0.25, 'sents_f': 0.0}}
|
||||||
|
|
||||||
If your config contains missing values, you can run the 'init fill-config'
|
If your config contains missing values, you can run the 'init fill-config'
|
||||||
command to fill in all the defaults, if possible:
|
command to fill in all the defaults, if possible:
|
||||||
|
@ -357,6 +357,16 @@ Registry @architectures
|
||||||
Name spacy.MaxoutWindowEncoder.v1
|
Name spacy.MaxoutWindowEncoder.v1
|
||||||
Module spacy.ml.models.tok2vec
|
Module spacy.ml.models.tok2vec
|
||||||
File /path/to/spacy/ml/models/tok2vec.py (line 207)
|
File /path/to/spacy/ml/models/tok2vec.py (line 207)
|
||||||
|
ℹ [corpora.dev]
|
||||||
|
Registry @readers
|
||||||
|
Name spacy.Corpus.v1
|
||||||
|
Module spacy.training.corpus
|
||||||
|
File /path/to/spacy/training/corpus.py (line 18)
|
||||||
|
ℹ [corpora.train]
|
||||||
|
Registry @readers
|
||||||
|
Name spacy.Corpus.v1
|
||||||
|
Module spacy.training.corpus
|
||||||
|
File /path/to/spacy/training/corpus.py (line 18)
|
||||||
ℹ [training.logger]
|
ℹ [training.logger]
|
||||||
Registry @loggers
|
Registry @loggers
|
||||||
Name spacy.ConsoleLogger.v1
|
Name spacy.ConsoleLogger.v1
|
||||||
|
@ -372,11 +382,6 @@ Registry @schedules
|
||||||
Name compounding.v1
|
Name compounding.v1
|
||||||
Module thinc.schedules
|
Module thinc.schedules
|
||||||
File /path/to/thinc/thinc/schedules.py (line 43)
|
File /path/to/thinc/thinc/schedules.py (line 43)
|
||||||
ℹ [training.dev_corpus]
|
|
||||||
Registry @readers
|
|
||||||
Name spacy.Corpus.v1
|
|
||||||
Module spacy.training.corpus
|
|
||||||
File /path/to/spacy/training/corpus.py (line 18)
|
|
||||||
ℹ [training.optimizer]
|
ℹ [training.optimizer]
|
||||||
Registry @optimizers
|
Registry @optimizers
|
||||||
Name Adam.v1
|
Name Adam.v1
|
||||||
|
@ -387,11 +392,6 @@ Registry @schedules
|
||||||
Name warmup_linear.v1
|
Name warmup_linear.v1
|
||||||
Module thinc.schedules
|
Module thinc.schedules
|
||||||
File /path/to/thinc/thinc/schedules.py (line 91)
|
File /path/to/thinc/thinc/schedules.py (line 91)
|
||||||
ℹ [training.train_corpus]
|
|
||||||
Registry @readers
|
|
||||||
Name spacy.Corpus.v1
|
|
||||||
Module spacy.training.corpus
|
|
||||||
File /path/to/spacy/training/corpus.py (line 18)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
|
@ -26,7 +26,7 @@ streaming.
|
||||||
> [paths]
|
> [paths]
|
||||||
> train = "corpus/train.spacy"
|
> train = "corpus/train.spacy"
|
||||||
>
|
>
|
||||||
> [training.train_corpus]
|
> [corpora.train]
|
||||||
> @readers = "spacy.Corpus.v1"
|
> @readers = "spacy.Corpus.v1"
|
||||||
> path = ${paths.train}
|
> path = ${paths.train}
|
||||||
> gold_preproc = false
|
> gold_preproc = false
|
||||||
|
@ -135,7 +135,7 @@ Initialize the reader.
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> ### Example config
|
> ### Example config
|
||||||
> [pretraining.corpus]
|
> [corpora.pretrain]
|
||||||
> @readers = "spacy.JsonlReader.v1"
|
> @readers = "spacy.JsonlReader.v1"
|
||||||
> path = "corpus/raw_text.jsonl"
|
> path = "corpus/raw_text.jsonl"
|
||||||
> min_length = 0
|
> min_length = 0
|
||||||
|
|
|
@ -121,6 +121,55 @@ that you don't want to hard-code in your config file.
|
||||||
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
|
$ python -m spacy train config.cfg --paths.train ./corpus/train.spacy
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### corpora {#config-corpora tag="section"}
|
||||||
|
|
||||||
|
This section defines a dictionary mapping of string keys to `Callable`
|
||||||
|
functions. Each callable takes an `nlp` object and yields
|
||||||
|
[`Example`](/api/example) objects. By default, the two keys `train` and `dev`
|
||||||
|
are specified and each refer to a [`Corpus`](/api/top-level#Corpus). When
|
||||||
|
pretraining, an additional pretrain section is added that defaults to a
|
||||||
|
[`JsonlReader`](/api/top-level#JsonlReader).
|
||||||
|
|
||||||
|
These subsections can be expanded with additional subsections, each referring to
|
||||||
|
a callback of type `Callable[[Language], Iterator[Example]]`:
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora]
|
||||||
|
> [corpora.train]
|
||||||
|
> @readers = "spacy.Corpus.v1"
|
||||||
|
> path = ${paths:train}
|
||||||
|
>
|
||||||
|
> [corpora.dev]
|
||||||
|
> @readers = "spacy.Corpus.v1"
|
||||||
|
> path = ${paths:dev}
|
||||||
|
>
|
||||||
|
> [corpora.pretrain]
|
||||||
|
> @readers = "spacy.JsonlReader.v1"
|
||||||
|
> path = ${paths.raw}
|
||||||
|
> min_length = 5
|
||||||
|
> max_length = 500
|
||||||
|
>
|
||||||
|
> [corpora.mydata]
|
||||||
|
> @readers = "my_reader.v1"
|
||||||
|
> shuffle = true
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Alternatively, the `corpora` block could refer to one function with return type
|
||||||
|
`Dict[str, Callable[[Language], Iterator[Example]]]`:
|
||||||
|
|
||||||
|
> #### Example
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [corpora]
|
||||||
|
> @readers = "my_dict_reader.v1"
|
||||||
|
> train_path = ${paths:train}
|
||||||
|
> dev_path = ${paths:dev}
|
||||||
|
> shuffle = true
|
||||||
|
>
|
||||||
|
> ```
|
||||||
|
|
||||||
### training {#config-training tag="section"}
|
### training {#config-training tag="section"}
|
||||||
|
|
||||||
This section defines settings and controls for the training and evaluation
|
This section defines settings and controls for the training and evaluation
|
||||||
|
@ -130,7 +179,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
||||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||||
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
|
||||||
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ |
|
||||||
|
@ -142,7 +191,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
||||||
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
| `raw_text` | Optional path to a jsonl file with unlabelled text documents for a [rehearsal](/api/language#rehearse) step. Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
||||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ |
|
||||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. Defaults to [`Corpus`](/api/top-level#Corpus). ~~Callable[[Language], Iterator[Example]]~~ |
|
| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ |
|
||||||
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
| `vectors` | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~ |
|
||||||
|
|
||||||
### pretraining {#config-pretraining tag="section,optional"}
|
### pretraining {#config-pretraining tag="section,optional"}
|
||||||
|
@ -151,17 +200,18 @@ This section is optional and defines settings and controls for
|
||||||
[language model pretraining](/usage/embeddings-transformers#pretraining). It's
|
[language model pretraining](/usage/embeddings-transformers#pretraining). It's
|
||||||
used when you run [`spacy pretrain`](/api/cli#pretrain).
|
used when you run [`spacy pretrain`](/api/cli#pretrain).
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| -------------- | ------------------------------------------------------------------------------------------------------ |
|
||||||
| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
|
| `max_epochs` | Maximum number of epochs. Defaults to `1000`. ~~int~~ |
|
||||||
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
|
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
|
||||||
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
|
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
|
||||||
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
|
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
|
||||||
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
||||||
| `corpus` | Callable that takes the current `nlp` object and yields [`Doc`](/api/doc) objects. Defaults to [`JsonlReader`](/api/top-level#JsonlReader). ~~Callable[[Language, str], Iterable[Example]]~~ |
|
| `corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.pretrain`. ~~str~~ |
|
||||||
| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
| `batcher` | Batcher for the training data. ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
|
||||||
| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
|
| `component` | Component to find the layer to pretrain. Defaults to `"tok2vec"`. ~~str~~ |
|
||||||
| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
|
| `layer` | The layer to pretrain. If empty, the whole component model will be used. ~~str~~ |
|
||||||
|
| |
|
||||||
|
|
||||||
## Training data {#training}
|
## Training data {#training}
|
||||||
|
|
||||||
|
|
|
@ -448,7 +448,7 @@ remain in the config file stored on your local system.
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.WandbLogger.v1"
|
> @loggers = "spacy.WandbLogger.v1"
|
||||||
> project_name = "monitor_spacy_training"
|
> project_name = "monitor_spacy_training"
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
@ -478,7 +478,7 @@ the [`Corpus`](/api/corpus) class.
|
||||||
> [paths]
|
> [paths]
|
||||||
> train = "corpus/train.spacy"
|
> train = "corpus/train.spacy"
|
||||||
>
|
>
|
||||||
> [training.train_corpus]
|
> [corpora.train]
|
||||||
> @readers = "spacy.Corpus.v1"
|
> @readers = "spacy.Corpus.v1"
|
||||||
> path = ${paths.train}
|
> path = ${paths.train}
|
||||||
> gold_preproc = false
|
> gold_preproc = false
|
||||||
|
@ -506,7 +506,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class.
|
||||||
> [paths]
|
> [paths]
|
||||||
> pretrain = "corpus/raw_text.jsonl"
|
> pretrain = "corpus/raw_text.jsonl"
|
||||||
>
|
>
|
||||||
> [pretraining.corpus]
|
> [corpora.pretrain]
|
||||||
> @readers = "spacy.JsonlReader.v1"
|
> @readers = "spacy.JsonlReader.v1"
|
||||||
> path = ${paths.pretrain}
|
> path = ${paths.pretrain}
|
||||||
> min_length = 0
|
> min_length = 0
|
||||||
|
|
|
@ -969,7 +969,7 @@ your results.
|
||||||
> [training.logger]
|
> [training.logger]
|
||||||
> @loggers = "spacy.WandbLogger.v1"
|
> @loggers = "spacy.WandbLogger.v1"
|
||||||
> project_name = "monitor_spacy_training"
|
> project_name = "monitor_spacy_training"
|
||||||
> remove_config_values = ["paths.train", "paths.dev", "training.dev_corpus.path", "training.train_corpus.path"]
|
> remove_config_values = ["paths.train", "paths.dev", "corpora.train.path", "corpora.dev.path"]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
![Screenshot: Visualized training results](../images/wandb1.jpg)
|
||||||
|
|
|
@ -746,7 +746,7 @@ as **config settings** – in this case, `source`.
|
||||||
> #### config.cfg
|
> #### config.cfg
|
||||||
>
|
>
|
||||||
> ```ini
|
> ```ini
|
||||||
> [training.train_corpus]
|
> [corpora.train]
|
||||||
> @readers = "corpus_variants.v1"
|
> @readers = "corpus_variants.v1"
|
||||||
> source = "s3://your_bucket/path/data.csv"
|
> source = "s3://your_bucket/path/data.csv"
|
||||||
> ```
|
> ```
|
||||||
|
|
Loading…
Reference in New Issue
Block a user